@@ -61,12 +61,6 @@ type Sequence struct {
 	n_prompt_tokens int
 }
 
-// prompt returns true if the prompt is still being processed
-// TODO (jmorganca): clean up this logic
-func (s *Sequence) prompt() bool {
-	return s.nPast < len(s.tokens)-1
-}
-
 func (s *Server) NewSequence(prompt string, numPredict int, stop []string, params *llama.SamplingParams, embedding bool) *Sequence {
 	tokens, err := s.lc.Model().Tokenize(prompt, true, true)
 	if err != nil {
@@ -176,14 +170,17 @@ func (s *Server) run(ctx context.Context) {
 			seq.t_start_process_prompt = time.Now()
 		}
 
+		var numTokensProcessed int
 		for j, t := range seq.tokens {
 			// todo: make this n_batch
 			if j >= s.batchSize {
 				break
 			}
-			batch.Add(t, seq.nPast, []int{i}, !seq.prompt())
+			batch.Add(t, seq.nPast, []int{i}, numTokensProcessed+1 == len(seq.tokens))
 			seq.nPast++
+			numTokensProcessed++
 		}
+		seq.tokens = seq.tokens[numTokensProcessed:]
 		seq.iBatch = batch.NumTokens() - 1
 	}
 
@@ -199,7 +196,7 @@ func (s *Server) run(ctx context.Context) {
 		}
 
 		// don't sample prompt processing
-		if seq.prompt() {
+		if len(seq.tokens) != 0 {
 			continue
 		}
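
For clarity, below is a minimal, self-contained Go sketch of the bookkeeping this diff introduces. The toySeq and toyBatch types, the batchSize constant, and the hard-coded token values are illustrative stand-ins, not the runner's real Sequence, llama batch, or server fields; the sketch only shows that logits are requested for the final remaining token, that processed tokens are sliced off seq.tokens, and that an empty slice is what the `len(seq.tokens) != 0` sampling guard checks.

package main

import "fmt"

// toySeq and toyBatch are illustrative stand-ins for the runner's Sequence
// and llama batch types; they model only the fields this diff touches.
type toySeq struct {
	tokens []int // remaining prompt tokens (or the single sampled token)
	nPast  int   // tokens already pushed into the cache
}

type toyBatch struct {
	logits []bool // whether logits were requested for each added token
}

// Add mirrors the shape of batch.Add(token, pos, seqIds, logits) but only
// records the logits flag, which is what the diff changes.
func (b *toyBatch) Add(token, pos int, seqIds []int, wantLogits bool) {
	b.logits = append(b.logits, wantLogits)
}

func main() {
	const batchSize = 4 // stand-in for s.batchSize
	seq := &toySeq{tokens: []int{101, 102, 103}}
	batch := &toyBatch{}

	var numTokensProcessed int
	for j, t := range seq.tokens {
		if j >= batchSize {
			break
		}
		// Request logits only for the last remaining token, i.e. the end
		// of the prompt (or the one freshly generated token).
		batch.Add(t, seq.nPast, []int{0}, numTokensProcessed+1 == len(seq.tokens))
		seq.nPast++
		numTokensProcessed++
	}
	// Drop the tokens that made it into this batch; once the slice is
	// empty, prompt processing is done and the sequence may be sampled.
	seq.tokens = seq.tokens[numTokensProcessed:]

	fmt.Println("logits flags:", batch.logits)            // [false false true]
	fmt.Println("ready to sample:", len(seq.tokens) == 0) // true
}

Note that when the prompt is longer than the batch size, the loop breaks before reaching the last token, so logits are not requested in that pass and the sequence keeps being skipped by the `len(seq.tokens) != 0` check until the remaining tokens are consumed on later passes.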