
prototype

Bruce MacDonald 2 months ago
parent
commit
fdbb0b5cfe
7 changed files with 154 additions and 171 deletions
  1. api/types.go (+17 -4)
  2. llama/llama.go (+0 -12)
  3. llama/runner/runner.go (+49 -39)
  4. llama/runner/stop.go (+7 -38)
  5. llama/runner/stop_test.go (+29 -59)
  6. llm/server.go (+27 -11)
  7. server/routes.go (+25 -8)

+ 17 - 4
api/types.go

@@ -77,6 +77,8 @@ type GenerateRequest struct {
 	// request, for multimodal models.
 	Images []ImageData `json:"images,omitempty"`
 
+	LogProbs int `json:"logprobs,omitempty"`
+
 	// Options lists model-specific options. For example, temperature can be
 	// set through this field, if the model supports it.
 	Options map[string]interface{} `json:"options"`
@@ -103,6 +105,8 @@ type ChatRequest struct {
 	// Tools is an optional list of tools the model has access to.
 	Tools `json:"tools,omitempty"`
 
+	LogProbs int `json:"logprobs,omitempty"`
+
 	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
 }
@@ -182,13 +186,20 @@ func (t *ToolFunction) String() string {
 	return string(bts)
 }
 
+type TokenProbs struct {
+	TokenID int     `json:"id"`
+	LogProb float32 `json:"logprob"`
+	Token   string  `json:"token"`
+}
+
 // ChatResponse is the response returned by [Client.Chat]. Its fields are
 // similar to [GenerateResponse].
 type ChatResponse struct {
-	Model      string    `json:"model"`
-	CreatedAt  time.Time `json:"created_at"`
-	Message    Message   `json:"message"`
-	DoneReason string    `json:"done_reason,omitempty"`
+	Model      string       `json:"model"`
+	CreatedAt  time.Time    `json:"created_at"`
+	Message    Message      `json:"message"`
+	DoneReason string       `json:"done_reason,omitempty"`
+	LogProbs   []TokenProbs `json:"logprobs,omitempty"`
 
 	Done bool `json:"done"`
 
@@ -452,6 +463,8 @@ type GenerateResponse struct {
 	// can be sent in the next request to keep a conversational memory.
 	Context []int `json:"context,omitempty"`
 
+	LogProbs []TokenProbs `json:"logprobs,omitempty"`
+
 	Metrics
 }
 

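For context, a minimal sketch of how a Go client might exercise the new request and response fields, assuming the existing api.ClientFromEnvironment constructor and streaming Generate call; the model name and the top-N value of 3 are illustrative only:

    package main

    import (
    	"context"
    	"fmt"
    	"log"

    	"github.com/ollama/ollama/api"
    )

    func main() {
    	client, err := api.ClientFromEnvironment()
    	if err != nil {
    		log.Fatal(err)
    	}

    	req := &api.GenerateRequest{
    		Model:    "llama3.2", // illustrative model name
    		Prompt:   "Why is the sky blue?",
    		LogProbs: 3, // ask for the top 3 candidates per generated token
    	}

    	// Each streamed GenerateResponse carries the per-token candidates.
    	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
    		for _, p := range resp.LogProbs {
    			fmt.Printf("token=%q id=%d logprob=%.4f\n", p.Token, p.TokenID, p.LogProb)
    		}
    		return nil
    	})
    	if err != nil {
    		log.Fatal(err)
    	}
    }
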
+ 0 - 12
llama/llama.go

@@ -233,18 +233,6 @@ func (c *Context) GetLogits() []float32 {
 	return unsafe.Slice((*float32)(logits), vocabSize)
 }
 
-func (m *Model) Detokenize(tokens []int) (string, error) {
-	var text string
-	for _, token := range tokens {
-		piece := m.TokenToPiece(token)
-		if piece == "" {
-			return "", fmt.Errorf("failed to convert token %d to piece", token)
-		}
-		text += piece
-	}
-	return text, nil
-}
-
 type ModelParams struct {
 	NumGpuLayers int
 	MainGpu      int

+ 49 - 39
llama/runner/runner.go

@@ -104,6 +104,7 @@ type NewSequenceParams struct {
 	numKeep        int
 	samplingParams *llama.SamplingParams
 	embedding      bool
+	logprobs       int
 }
 
 func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequenceParams) (*Sequence, error) {
@@ -164,6 +165,7 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 		embeddingOnly:       params.embedding,
 		stop:                params.stop,
 		numKeep:             params.numKeep,
+		logprobs:            params.logprobs,
 	}, nil
 }
 
@@ -285,37 +287,34 @@ func flushPending(seq *Sequence) bool {
 	if len(seq.pendingResponses) == 0 {
 		return true
 	}
-	content := ""
+	resps := []CompletionResponse{}
 	for _, resp := range seq.pendingResponses {
-		content += resp.Content
+		resps = append(resps, resp)
 	}
 	seq.pendingResponses = []CompletionResponse{}
 
-	// Check if there are any partial UTF-8 characters remaining.
-	// We already check and queue as we are generating but some may
-	// still make it here:
-	// - Sequence is ending, e.g. generation limit has been hit
-	// - Invalid characters in the middle of a string
-	// This is a stricter check to ensure we never output invalid Unicode.
-	for !utf8.ValidString(content) {
-		content = content[:len(content)-1]
-	}
+	// TODO: figure out this result logic
+	result := false
+	for _, resp := range resps {
+		// Check if there are any partial UTF-8 characters remaining.
+		// We already check and queue as we are generating but some may
+		// still make it here:
+		// - Sequence is ending, e.g. generation limit has been hit
+		// - Invalid characters in the middle of a string
+		// This is a stricter check to ensure we never output invalid Unicode.
+		for !utf8.ValidString(resp.Content) {
+			resp.Content = resp.Content[:len(resp.Content)-1]
+		}
 
-	// Add logits if requested and available
-	wantLogits := true
-	if wantLogits && seq.logits != nil {
-		// resp.Logits = seq.logits
-		seq.logits = nil
+		select {
+		case seq.responses <- resp:
+			result = true
+		case <-seq.quit:
+			result = false
+		}
 	}
 
-	select {
-	case seq.responses <- CompletionResponse{
-		Content: content,
-	}:
-		return true
-	case <-seq.quit:
-		return false
-	}
+	return result
 }
 
 func (s *Server) removeSequence(seqIndex int, reason string) {
@@ -371,10 +370,11 @@ func (s *Server) run(ctx context.Context) {
 
 // TokenProbs represents probability information for a token
 type TokenProbs struct {
-	TokenID int
-	Logit   float32
-	Prob    float32
-	LogProb float32
+	TokenID int     `json:"id"`
+	Logit   float32 `json:"logit"`
+	Prob    float32 `json:"prob"`
+	LogProb float32 `json:"logprob"`
+	Token   string  `json:"token"`
 }
 
 // probs returns sorted token probabilities for a specific token index
@@ -553,9 +553,17 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 
 		seq.numPredicted++
 
+		resp := CompletionResponse{Content: piece}
+
 		if seq.logprobs > 0 {
 			// TODO: return selected token in logprobs always
-			// probs := s.probs(seq)
+			resp.LogProbs = s.probs(seq)
+			// TODO: fix this logprobs limit
+			resp.LogProbs = resp.LogProbs[:min(len(resp.LogProbs), seq.logprobs)]
+			for i := range resp.LogProbs {
+				// decode the token id to a piece
+				resp.LogProbs[i].Token = s.model.TokenToPiece(resp.LogProbs[i].TokenID)
+			}
 		}
 
 		// if it's an end of sequence token, break
@@ -571,7 +579,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		seq.inputs = []input{{token: token}}
 
 		// TODO: add probs here
-		seq.pendingResponses = append(seq.pendingResponses, CompletionResponse{Content: piece})
+		seq.pendingResponses = append(seq.pendingResponses, resp)
 		var sequence string
 		for _, r := range seq.pendingResponses {
 			sequence += r.Content
@@ -580,10 +588,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		if ok, stop := findStop(sequence, seq.stop); ok {
 			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
 
+			// TODO: fix this stop sequence caching
 			var tokenTruncated bool
-			origLen := len(seq.pendingResponses)
-			seq.pendingResponses, tokenTruncated = truncateStop(seq.pendingResponses, stop)
-			newLen := len(seq.pendingResponses)
+			origLen := len(sequence)
+			sequence, tokenTruncated = truncateStop(sequence, stop)
+			newLen := len(sequence)
 
 			// Update the cache based on the tokens that will be returned:
 			// - We have 1 token more than is currently in the cache because
@@ -654,6 +663,7 @@ type CompletionRequest struct {
 	Images      []ImageData `json:"image_data"`
 	Grammar     string      `json:"grammar"`
 	CachePrompt bool        `json:"cache_prompt"`
+	Logprobs    int         `json:"logprobs,omitempty"`
 
 	Options
 }
@@ -669,8 +679,10 @@ type CompletionResponse struct {
 	Content string `json:"content"`
 	Stop    bool   `json:"stop"`
 
-	Model        string  `json:"model,omitempty"`
-	Prompt       string  `json:"prompt,omitempty"`
+	Model    string       `json:"model,omitempty"`
+	Prompt   string       `json:"prompt,omitempty"`
+	LogProbs []TokenProbs `json:"logprobs,omitempty"`
+
 	StoppedLimit bool    `json:"stopped_limit,omitempty"`
 	PredictedN   int     `json:"predicted_n,omitempty"`
 	PredictedMS  float64 `json:"predicted_ms,omitempty"`
@@ -688,10 +700,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Set the headers to indicate streaming
-	w.Header().Set("Content-Type", "application/json")
-	w.Header().Set("Transfer-Encoding", "chunked")
-
 	flusher, ok := w.(http.Flusher)
 	if !ok {
 		http.Error(w, "Streaming not supported", http.StatusInternalServerError)
@@ -720,6 +728,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		numKeep:        req.NumKeep,
 		samplingParams: &samplingParams,
 		embedding:      false,
+		logprobs:       req.Logprobs,
 	})
 	if err != nil {
 		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
@@ -769,6 +778,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		case resp, ok := <-seq.responses:
 			if ok {
+				fmt.Println("response", resp)
 				if err := json.NewEncoder(w).Encode(&resp); err != nil {
 					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 					close(seq.quit)

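The probs helper that processBatch now calls is not shown in this diff. A minimal sketch of the computation it presumably performs (softmax over the raw logits, then a descending sort so callers can slice off the top N), assuming Context.GetLogits from llama/llama.go above returns one logit per vocabulary token; probsFromLogits is a hypothetical stand-alone name:

    package runner

    import (
    	"math"
    	"sort"
    )

    // probsFromLogits converts raw logits into sorted TokenProbs entries.
    // The Token string is left empty; the caller fills it via TokenToPiece,
    // as the processBatch hunk above does.
    func probsFromLogits(logits []float32) []TokenProbs {
    	probs := make([]TokenProbs, len(logits))

    	// Subtract the max logit before exponentiating for numerical stability.
    	maxLogit := float32(math.Inf(-1))
    	for _, l := range logits {
    		if l > maxLogit {
    			maxLogit = l
    		}
    	}

    	var sum float64
    	for i, l := range logits {
    		e := math.Exp(float64(l - maxLogit))
    		sum += e
    		probs[i] = TokenProbs{TokenID: i, Logit: l, Prob: float32(e)}
    	}

    	// Normalize and take logs.
    	for i := range probs {
    		p := float64(probs[i].Prob) / sum
    		probs[i].Prob = float32(p)
    		probs[i].LogProb = float32(math.Log(p))
    	}

    	// Highest-probability tokens first.
    	sort.Slice(probs, func(i, j int) bool { return probs[i].Prob > probs[j].Prob })

    	return probs
    }
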
+ 7 - 38
llama/runner/stop.go

@@ -26,46 +26,15 @@ func containsStopSuffix(sequence string, stops []string) bool {
 	return false
 }
 
-// truncateStop removes the provided stop string from pieces,
-// returning the partial pieces with stop removed, including truncating
-// the last piece if required (and signalling if this was the case)
-func truncateStop(pieces []CompletionResponse, stop string) ([]CompletionResponse, bool) {
-	// Build complete string and find stop position
-	var completeStr string
-	for _, piece := range pieces {
-		completeStr += piece.Content
+// truncateStop removes the provided stop string from sequence,
+// returning both the truncated sequence and a bool indicating if truncation occurred
+func truncateStop(sequence string, stop string) (string, bool) {
+	index := strings.Index(sequence, stop)
+	if index == -1 {
+		return sequence, false
 	}
 
-	stopStart := strings.Index(completeStr, stop)
-	if stopStart == -1 {
-		return pieces, false
-	}
-
-	// Build result up to stop position
-	result := make([]CompletionResponse, 0)
-	accumulated := 0
-
-	truncated := false
-	for _, piece := range pieces {
-		if accumulated+len(piece.Content) <= stopStart {
-			result = append(result, piece)
-			accumulated += len(piece.Content)
-			continue
-		}
-
-		if accumulated < stopStart {
-			truncPiece := piece
-			truncPiece.Content = piece.Content[:stopStart-accumulated]
-			if len(truncPiece.Content) > 0 {
-				result = append(result, truncPiece)
-				truncated = true
-			}
-		}
-		break
-	}
-
-	// Signal if we had to truncate the last piece
-	return result, truncated
+	return sequence[:index], true
 }
 
 func incompleteUnicode(token string) bool {

+ 29 - 59
llama/runner/stop_test.go

@@ -1,90 +1,60 @@
 package runner
 
 import (
-	"reflect"
 	"testing"
 )
 
 func TestTruncateStop(t *testing.T) {
 	tests := []struct {
 		name          string
-		pieces        []CompletionResponse
+		sequence      string
 		stop          string
-		expected      []CompletionResponse
+		expected      string
 		expectedTrunc bool
 	}{
 		{
-			name: "Single word",
-			pieces: []CompletionResponse{
-				{Content: "hello"},
-				{Content: "world"},
-			},
-			stop: "world",
-			expected: []CompletionResponse{
-				{Content: "hello"},
-			},
-			expectedTrunc: false,
+			name:          "Single word",
+			sequence:      "helloworld",
+			stop:          "world",
+			expected:      "hello",
+			expectedTrunc: true,
 		},
 		{
-			name: "Partial",
-			pieces: []CompletionResponse{
-				{Content: "hello"},
-				{Content: "wor"},
-			},
-			stop: "or",
-			expected: []CompletionResponse{
-				{Content: "hello"},
-				{Content: "w"},
-			},
+			name:          "Partial",
+			sequence:      "hellowor",
+			stop:          "or",
+			expected:      "hellow",
 			expectedTrunc: true,
 		},
 		{
-			name: "Suffix",
-			pieces: []CompletionResponse{
-				{Content: "Hello"},
-				{Content: " there"},
-				{Content: "!"},
-			},
-			stop: "!",
-			expected: []CompletionResponse{
-				{Content: "Hello"},
-				{Content: " there"},
-			},
-			expectedTrunc: false,
+			name:          "Suffix",
+			sequence:      "Hello there!",
+			stop:          "!",
+			expected:      "Hello there",
+			expectedTrunc: true,
 		},
 		{
-			name: "Suffix partial",
-			pieces: []CompletionResponse{
-				{Content: "Hello"},
-				{Content: " the"},
-				{Content: "re!"},
-			},
-			stop: "there!",
-			expected: []CompletionResponse{
-				{Content: "Hello"},
-				{Content: " "},
-			},
+			name:          "Middle",
+			sequence:      "hello wor",
+			stop:          "llo w",
+			expected:      "he",
 			expectedTrunc: true,
 		},
 		{
-			name: "Middle",
-			pieces: []CompletionResponse{
-				{Content: "hello"},
-				{Content: " wor"},
-			},
-			stop: "llo w",
-			expected: []CompletionResponse{
-				{Content: "he"},
-			},
-			expectedTrunc: true,
+			name:          "No stop found",
+			sequence:      "hello world",
+			stop:          "xyz",
+			expected:      "hello world",
+			expectedTrunc: false,
 		},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result, resultTrunc := truncateStop(tt.pieces, tt.stop)
-			if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
-				t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
+			result, truncated := truncateStop(tt.sequence, tt.stop)
+			if result != tt.expected || truncated != tt.expectedTrunc {
+				t.Errorf("truncateStop(%q, %q): have %q (%v); want %q (%v)",
+					tt.sequence, tt.stop, result, truncated, tt.expected, tt.expectedTrunc)
 			}
 		})
 	}

+ 27 - 11
llm/server.go

@@ -644,12 +644,22 @@ type ImageData struct {
 	AspectRatioID int    `json:"aspect_ratio_id"`
 }
 
+// TokenProbs represents probability information for a token
+type TokenProbs struct {
+	TokenID int     `json:"id"`
+	Logit   float32 `json:"logit"`
+	Prob    float32 `json:"prob"`
+	LogProb float32 `json:"logprob"`
+	Token   string  `json:"token"`
+}
+
 type completion struct {
-	Content      string `json:"content"`
-	Model        string `json:"model"`
-	Prompt       string `json:"prompt"`
-	Stop         bool   `json:"stop"`
-	StoppedLimit bool   `json:"stopped_limit"`
+	Content      string       `json:"content"`
+	Model        string       `json:"model"`
+	Prompt       string       `json:"prompt"`
+	Stop         bool         `json:"stop"`
+	StoppedLimit bool         `json:"stopped_limit"`
+	LogProbs     []TokenProbs `json:"logprobs"`
 
 	Timings struct {
 		PredictedN  int     `json:"predicted_n"`
@@ -660,14 +670,16 @@ type completion struct {
 }
 
 type CompletionRequest struct {
-	Prompt  string
-	Format  json.RawMessage
-	Images  []ImageData
-	Options *api.Options
+	Prompt   string
+	Format   json.RawMessage
+	Images   []ImageData
+	LogProbs int
+	Options  *api.Options
 }
 
 type CompletionResponse struct {
 	Content            string
+	LogProbs           []TokenProbs
 	DoneReason         string
 	Done               bool
 	PromptEvalCount    int
@@ -698,9 +710,12 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		"seed":              req.Options.Seed,
 		"stop":              req.Options.Stop,
 		"image_data":        req.Images,
+		"logprobs":          req.LogProbs,
 		"cache_prompt":      true,
 	}
 
+	fmt.Println("completion request:", request)
+
 	if len(req.Format) > 0 {
 		switch string(req.Format) {
 		case `null`, `""`:
@@ -796,7 +811,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 				continue
 			}
 
-			// slog.Debug("got line", "line", string(line))
 			evt, ok := bytes.CutPrefix(line, []byte("data: "))
 			if !ok {
 				evt = line
@@ -822,7 +836,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 
 			if c.Content != "" {
 				fn(CompletionResponse{
-					Content: c.Content,
+					Content:  c.Content,
+					LogProbs: c.LogProbs,
 				})
 			}
 
@@ -839,6 +854,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 					PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
 					EvalCount:          c.Timings.PredictedN,
 					EvalDuration:       parseDurationMs(c.Timings.PredictedMS),
+					LogProbs:           c.LogProbs,
 				})
 				return nil
 			}

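A sketch of how one streamed runner event maps onto the completion struct above, written as if inside package llm and reusing the file's existing bytes, encoding/json, and fmt imports; the payload values and the decodeEventSketch name are made up for illustration:

    // decodeEventSketch decodes a single "data: ..." line of the kind the
    // runner streams, including the new logprobs array.
    func decodeEventSketch() error {
    	line := []byte(`data: {"content":" blue","logprobs":[{"id":3816,"logit":14.2,"prob":0.91,"logprob":-0.094,"token":" blue"}]}`)

    	evt, ok := bytes.CutPrefix(line, []byte("data: "))
    	if !ok {
    		evt = line
    	}

    	var c completion
    	if err := json.Unmarshal(evt, &c); err != nil {
    		return err
    	}

    	if len(c.LogProbs) > 0 {
    		fmt.Printf("content=%q top token=%q logprob=%.4f\n",
    			c.Content, c.LogProbs[0].Token, c.LogProbs[0].LogProb)
    	}
    	return nil
    }
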
+ 25 - 8
server/routes.go

@@ -293,11 +293,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		var sb strings.Builder
 		defer close(ch)
 		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
-			Prompt:  prompt,
-			Images:  images,
-			Format:  req.Format,
-			Options: opts,
+			Prompt:   prompt,
+			Images:   images,
+			Format:   req.Format,
+			LogProbs: req.LogProbs,
+			Options:  opts,
 		}, func(cr llm.CompletionResponse) {
+			fmt.Printf("banana: %#v\n", cr)
 			res := api.GenerateResponse{
 				Model:      req.Model,
 				CreatedAt:  time.Now().UTC(),
@@ -311,6 +313,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 					EvalDuration:       cr.EvalDuration,
 				},
 			}
+			for _, p := range cr.LogProbs {
+				res.LogProbs = append(res.LogProbs, api.TokenProbs{
+					TokenID: p.TokenID,
+					LogProb: p.LogProb,
+					Token:   p.Token,
+				})
+			}
 
 			if _, err := sb.WriteString(cr.Content); err != nil {
 				ch <- gin.H{"error": err.Error()}
@@ -1466,10 +1475,11 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		var sb strings.Builder
 		var toolCallIndex int = 0
 		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
-			Prompt:  prompt,
-			Images:  images,
-			Format:  req.Format,
-			Options: opts,
+			Prompt:   prompt,
+			Images:   images,
+			Format:   req.Format,
+			LogProbs: req.LogProbs,
+			Options:  opts,
 		}, func(r llm.CompletionResponse) {
 			res := api.ChatResponse{
 				Model:      req.Model,
@@ -1484,6 +1494,13 @@ func (s *Server) ChatHandler(c *gin.Context) {
 					EvalDuration:       r.EvalDuration,
 				},
 			}
+			for _, p := range r.LogProbs {
+				res.LogProbs = append(res.LogProbs, api.TokenProbs{
+					TokenID: p.TokenID,
+					LogProb: p.LogProb,
+					Token:   p.Token,
+				})
+			}
 
 			if r.Done {
 				res.TotalDuration = time.Since(checkpointStart)