@@ -50,8 +50,9 @@ type Sequence struct {
 	// inputs that have been added to a batch but not yet submitted to Decode
 	pendingInputs []input
 
+	// TODO: update this comment
 	// tokens that have been generated but not returned yet (e.g. for stop sequences)
-	pendingResponses []string
+	pendingResponses []CompletionResponse
 
 	// input cache being used by this sequence
 	cache *InputCacheSlot
@@ -87,6 +88,9 @@ type Sequence struct {
 
 	logits []float32
 
+	// number of logprobs to return with the completion response
+	logprobs int
+
 	// Metrics
 	startProcessingTime time.Time
 	startGenerationTime time.Time
@@ -152,7 +156,7 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 		numPromptInputs:     len(inputs),
 		startProcessingTime: startTime,
 		numPredict:          params.numPredict,
-		pendingResponses:    make([]string, 0),
+		pendingResponses:    make([]CompletionResponse, 0),
 		responses:           make(chan CompletionResponse, 100),
 		quit:                make(chan bool, 1),
 		embedding:           make(chan []float32, 1),
@@ -281,8 +285,11 @@ func flushPending(seq *Sequence) bool {
 	if len(seq.pendingResponses) == 0 {
 		return true
 	}
-	content := strings.Join(seq.pendingResponses, "")
-	seq.pendingResponses = []string{}
+	content := ""
+	for _, resp := range seq.pendingResponses {
+		content += resp.Content
+	}
+	seq.pendingResponses = []CompletionResponse{}
 
 	// Check if there are any partial UTF-8 characters remaining.
 	// We already check and queue as we are generating but some may
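Aside, not part of the diff: the new += loop in flushPending concatenates the same pieces the old strings.Join call did. If the per-iteration allocations ever mattered, the same result could be built with a strings.Builder; a minimal sketch assuming only the seq.pendingResponses slice and Content field shown above:

	// Equivalent concatenation with strings.Builder (illustrative only).
	var sb strings.Builder
	for _, resp := range seq.pendingResponses {
		sb.WriteString(resp.Content)
	}
	content := sb.String()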
@@ -362,27 +369,27 @@ func (s *Server) run(ctx context.Context) {
 	}
 }
 
-// TokenData represents probability information for a token
-type TokenData struct {
+// TokenProbs represents probability information for a token
+type TokenProbs struct {
 	TokenID int
 	Logit   float32
 	Prob    float32
 	LogProb float32
 }
 
-// getTokenProbabilities returns sorted token probabilities for a specific token index
-func (s *Server) getTokenProbabilities(seq *Sequence) []TokenData {
+// probs returns sorted token probabilities for a specific token index
+func (s *Server) probs(seq *Sequence) []TokenProbs {
 	// Get logits for the specific token index
 	logits := s.lc.GetLogits()
 	seq.logits = make([]float32, len(logits))
 	copy(seq.logits, logits)
 
 	vocabSize := s.model.NumVocab()
-	probs := make([]TokenData, vocabSize)
+	probs := make([]TokenProbs, vocabSize)
 
 	// Initialize token data with logits
 	for i := 0; i < vocabSize; i++ {
-		probs[i] = TokenData{
+		probs[i] = TokenProbs{
			TokenID: i,
			Logit:   logits[i],
		}
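The hunk above ends before Prob and LogProb are filled in. A standalone sketch (illustrative only, not taken from the PR) of the likely remaining steps, a numerically stable softmax over the raw logits followed by a descending sort, using only the TokenProbs fields shown in the diff:

	// Illustrative sketch; not part of the diff above.
	package main

	import (
		"fmt"
		"math"
		"sort"
	)

	// TokenProbs mirrors the struct introduced in the diff.
	type TokenProbs struct {
		TokenID int
		Logit   float32
		Prob    float32
		LogProb float32
	}

	// softmaxAndSort fills Prob and LogProb from the raw logits and returns the
	// slice ordered from most to least probable token.
	func softmaxAndSort(probs []TokenProbs) []TokenProbs {
		// Subtract the max logit before exponentiating for numerical stability.
		maxLogit := math.Inf(-1)
		for _, p := range probs {
			if float64(p.Logit) > maxLogit {
				maxLogit = float64(p.Logit)
			}
		}

		sum := 0.0
		exps := make([]float64, len(probs))
		for i, p := range probs {
			exps[i] = math.Exp(float64(p.Logit) - maxLogit)
			sum += exps[i]
		}

		for i := range probs {
			p := exps[i] / sum
			probs[i].Prob = float32(p)
			probs[i].LogProb = float32(math.Log(p))
		}

		// Highest-probability tokens first, matching "returns sorted token probabilities".
		sort.Slice(probs, func(i, j int) bool { return probs[i].Prob > probs[j].Prob })
		return probs
	}

	func main() {
		probs := softmaxAndSort([]TokenProbs{
			{TokenID: 0, Logit: 1.0},
			{TokenID: 1, Logit: 3.0},
			{TokenID: 2, Logit: 2.0},
		})
		for _, p := range probs {
			fmt.Printf("token=%d prob=%.3f logprob=%.3f\n", p.TokenID, p.Prob, p.LogProb)
		}
	}

Sorting all vocabSize entries is O(V log V); if only the top seq.logprobs entries are ever needed, a partial selection of the top k would avoid sorting the full vocabulary.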
@@ -546,10 +553,9 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 
 		seq.numPredicted++
 
-		// TODO: only do this when flag specified
-		probs := s.getTokenProbabilities(seq)
-		for i := range 10 {
-			slog.Debug("top 10 tokens", "token", probs[i].TokenID, "prob", probs[i].Prob, "logit", probs[i].Logit, "piece", s.model.TokenToPiece(probs[i].TokenID))
+		if seq.logprobs > 0 {
+			// TODO: return selected token in logprobs always
+			// probs := s.probs(seq)
 		}
 
 		// if it's an end of sequence token, break
@@ -564,8 +570,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 
 		seq.inputs = []input{{token: token}}
 
-		seq.pendingResponses = append(seq.pendingResponses, piece)
-		sequence := strings.Join(seq.pendingResponses, "")
+		// TODO: add probs here
+		seq.pendingResponses = append(seq.pendingResponses, CompletionResponse{Content: piece})
+		var sequence string
+		for _, r := range seq.pendingResponses {
+			sequence += r.Content
+		}
 
 		if ok, stop := findStop(sequence, seq.stop); ok {
 			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
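Two TODOs remain open in the hunks above: always including the selected token in the returned logprobs, and attaching the probabilities to the CompletionResponse appended in processBatch. One possible shape for the first, as a hypothetical helper (name and signature are illustrative, reusing the TokenProbs type from the diff and assuming its input is already sorted by probability):

	// topLogprobs is a hypothetical helper, not part of the diff: given the
	// sorted output of s.probs(seq), keep the top n entries and make sure the
	// sampled token is always present, per the "return selected token in
	// logprobs always" TODO above.
	func topLogprobs(sorted []TokenProbs, selected, n int) []TokenProbs {
		if n > len(sorted) {
			n = len(sorted)
		}
		top := append([]TokenProbs(nil), sorted[:n]...)
		for _, p := range top {
			if p.TokenID == selected {
				return top
			}
		}
		// The sampled token fell outside the top n; append its entry so
		// callers can still report its logprob alongside the top candidates.
		for _, p := range sorted[n:] {
			if p.TokenID == selected {
				top = append(top, p)
				break
			}
		}
		return top
	}

How the result would be carried on the pending CompletionResponse is left open by the "add probs here" TODO, so that wiring is not sketched here.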