WIP but got logits n stuff

ParthSareen, 4 months ago
parent revision c92d418a7c

5 changed files with 114 additions and 47 deletions:

  1. api/types.go (+16, -0)
  2. llama/llama.go (+13, -11)
  3. llama/runner/runner.go (+56, -19)
  4. llm/server.go (+15, -9)
  5. server/routes.go (+14, -8)

api/types.go (+16, -0)

@@ -80,6 +80,8 @@ type GenerateRequest struct {
 	// Options lists model-specific options. For example, temperature can be
 	// set through this field, if the model supports it.
 	Options map[string]interface{} `json:"options"`
+
+	ReturnLogits bool `json:"return_logits,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
@@ -105,6 +107,8 @@ type ChatRequest struct {

 	// Options lists model-specific options.
 	Options map[string]interface{} `json:"options"`
+
+	ReturnLogits bool `json:"return_logits,omitempty"`
 }

 type Tools []Tool
@@ -189,6 +193,7 @@ type ChatResponse struct {
 	CreatedAt  time.Time `json:"created_at"`
 	Message    Message   `json:"message"`
 	DoneReason string    `json:"done_reason,omitempty"`
+	Logits     []float32 `json:"logits"`

 	Done bool `json:"done"`

@@ -204,6 +209,15 @@ type Metrics struct {
 	EvalDuration       time.Duration `json:"eval_duration,omitempty"`
 }

+type TokenLogprob struct {
+	Token   string  `json:"token"`
+	Logprob float32 `json:"logprob"`
+}
+
+type LogProbs struct {
+	TopLogprobs []TokenLogprob `json:"top_logprobs"`
+}
+
 // Options specified in [GenerateRequest].  If you add a new option here, also
 // add it to the API docs.
 type Options struct {
@@ -450,6 +464,8 @@ type GenerateResponse struct {
 	Context []int `json:"context,omitempty"`

 	Metrics
+
+	Logits []float32 `json:"logits"`
 }

 // ModelDetails provides details about a model.

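The API change is additive: return_logits is opt-in on both request types, while the response-side Logits fields lack omitempty, so responses without logits will still serialize "logits": null (presumably fine for a WIP). A minimal client sketch of the generate path, assuming this repo's api package; the model name is illustrative:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:        "llama3.2", // illustrative; any locally available model
		Prompt:       "Why is the sky blue?",
		ReturnLogits: true, // field added in this commit
	}

	// With the server changes below, each streamed chunk should also
	// carry the raw vocabulary-sized logit vector for its token.
	err = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		fmt.Printf("chunk=%q logits=%d\n", resp.Response, len(resp.Logits))
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}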
llama/llama.go (+13, -11)

@@ -260,6 +260,19 @@ func (c *Context) GetEmbeddingsIth(i int) []float32 {
 	return unsafe.Slice((*float32)(embeddings), c.Model().NEmbd())
 }

+// GetLogits returns the logits from the last decode operation.
+// The returned slice has length equal to the vocabulary size.
+func (c *Context) GetLogits() []float32 {
+	logits := unsafe.Pointer(C.llama_get_logits(c.c))
+	if logits == nil {
+		return nil
+	}
+
+	// Get the number of vocabulary tokens to determine array size
+	vocabSize := c.Model().NumVocab()
+	return unsafe.Slice((*float32)(logits), vocabSize)
+}
+
 type ModelParams struct {
 	NumGpuLayers int
 	MainGpu      int
@@ -737,14 +750,3 @@ func SchemaToGrammar(schema []byte) []byte {
 	}
 	return buf[:n]
 }
-
-// GetLogits returns the logits from the last decode operation.
-// The returned slice has length equal to the vocabulary size.
-func (c *Context) GetLogits() []float32 {
-	logits := unsafe.Pointer(C.llama_get_logits(c.c))
-	if logits == nil {
-		return nil
-	}
-
-	// Get the number of vocabulary tokens to determine array size
-	vocabSize := c.Model().NumVocab()

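A caveat worth noting on GetLogits: llama_get_logits returns a pointer into llama.cpp's internal output buffer, and unsafe.Slice only creates a view of that memory, so the returned slice is valid only until the next decode call. Any consumer that holds logits across decode steps (as the runner below does with seq.logits) may want the copy-before-reuse pattern sketched here; copyLogits is an illustrative helper, not part of this diff:

// copyLogits snapshots the current logits into Go-owned memory so they
// survive subsequent llama_decode calls. ctx is a *llama.Context.
func copyLogits(ctx *llama.Context) []float32 {
	src := ctx.GetLogits() // aliases llama.cpp's internal buffer
	if src == nil {
		return nil
	}
	dst := make([]float32, len(src))
	copy(dst, src)
	return dst
}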
llama/runner/runner.go (+56, -19)

@@ -8,12 +8,14 @@ import (
 	"fmt"
 	"log"
 	"log/slog"
+	"math"
 	"net"
 	"net/http"
 	"os"
 	"path/filepath"
 	"regexp"
 	"runtime"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -59,7 +61,7 @@ type Sequence struct {
 	crossAttention bool

 	// channel to send responses over
-	responses chan string
+	responses chan CompletionResponse

 	// channel to stop decoding (such as if the remote connection is closed)
 	quit chan bool
@@ -88,6 +90,15 @@ type Sequence struct {
 	startGenerationTime time.Time
 	numDecoded          int
 	numPromptInputs     int
+
+	// whether to capture and return raw logits for each generated token
+	returnLogits bool
+
+	// logits captured from the most recent decode, via Context.GetLogits
+	logits []float32
+
+	// channel for streaming logits out of the sequence (not yet consumed)
+	logitsOut chan []float32
 }

 type NewSequenceParams struct {
@@ -96,6 +107,7 @@ type NewSequenceParams struct {
 	numKeep        int
 	samplingParams *llama.SamplingParams
 	embedding      bool
+	returnLogits   bool
 }

 func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequenceParams) (*Sequence, error) {
@@ -149,13 +161,15 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequenceParams) (*Sequence, error) {
 		startProcessingTime: startTime,
 		numPredict:          params.numPredict,
 		pendingResponses:    make([]string, 0),
-		responses:           make(chan string, 100),
+		responses:           make(chan CompletionResponse, 100),
 		quit:                make(chan bool, 1),
 		embedding:           make(chan []float32, 1),
 		samplingCtx:         sc,
 		embeddingOnly:       params.embedding,
 		stop:                params.stop,
 		numKeep:             params.numKeep,
+		returnLogits:        params.returnLogits,
+		logitsOut:           make(chan []float32, 100),
 	}, nil
 }

@@ -274,25 +288,36 @@ func (s *Server) allNil() bool {
 }

 func flushPending(seq *Sequence) bool {
-	joined := strings.Join(seq.pendingResponses, "")
-	seq.pendingResponses = []string{}
+	if len(seq.pendingResponses) == 0 {
+		return true
+	}

+	content := strings.Join(seq.pendingResponses, "")
 	// Check if there are any partial UTF-8 characters remaining.
 	// We already check and queue as we are generating but some may
 	// still make it here:
 	// - Sequence is ending, e.g. generation limit has been hit
 	// - Invalid characters in the middle of a string
 	// This is a stricter check to ensure we never output invalid Unicode.
-	for !utf8.ValidString(joined) {
-		joined = joined[:len(joined)-1]
+	for !utf8.ValidString(content) {
+		content = content[:len(content)-1]
 	}
+	seq.pendingResponses = nil

-	if len(joined) == 0 {
-		return true
+	resp := CompletionResponse{
+		Content: content,
 	}

+	// Attach logits only when requested and captured; indexing
+	// resp.Logits unconditionally here would panic on the no-logits path.
+	if seq.returnLogits && seq.logits != nil {
+		resp.Logits = seq.logits
+		seq.logits = nil
+		slog.Debug("flushPending: attaching logits", "count", len(resp.Logits))
+	}
+
 	select {
-	case seq.responses <- joined:
+	case seq.responses <- resp:
 		return true
 	case <-seq.quit:
 		return false
@@ -476,7 +501,14 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}

-		// sample a token
+		// Capture the raw logits from this decode before sampling so
+		// flushPending can attach them to the streamed response. Note
+		// the slice aliases llama.cpp's buffer until the next decode.
+		if seq.returnLogits {
+			seq.logits = s.lc.GetLogits()
+		}
+
+		// sample a token
 		token := seq.samplingCtx.Sample(s.lc, seq.iBatch)
 		seq.samplingCtx.Accept(token, true)
 		piece := s.model.TokenToPiece(token)
@@ -572,10 +604,11 @@ type ImageData struct {
 }

 type CompletionRequest struct {
-	Prompt      string      `json:"prompt"`
-	Images      []ImageData `json:"image_data"`
-	Grammar     string      `json:"grammar"`
-	CachePrompt bool        `json:"cache_prompt"`
+	Prompt       string      `json:"prompt"`
+	Images       []ImageData `json:"image_data"`
+	Grammar      string      `json:"grammar"`
+	CachePrompt  bool        `json:"cache_prompt"`
+	ReturnLogits bool        `json:"return_logits"`

 	Options
 }
@@ -588,8 +621,10 @@ type Timings struct {
 }

 type CompletionResponse struct {
-	Content string `json:"content"`
-	Stop    bool   `json:"stop"`
+	Content string    `json:"content"`
+	Logits  []float32 `json:"logits,omitempty"`
+	Tokens  []string  `json:"tokens,omitempty"`
+	Stop    bool      `json:"stop"`

 	Model        string  `json:"model,omitempty"`
 	Prompt       string  `json:"prompt,omitempty"`
@@ -637,12 +672,14 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 	samplingParams.Seed = uint32(req.Seed)
 	samplingParams.Grammar = req.Grammar

+	slog.Debug("completion request", "return_logits", req.ReturnLogits)
 	seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
 		numPredict:     req.NumPredict,
 		stop:           req.Stop,
 		numKeep:        req.NumKeep,
 		samplingParams: &samplingParams,
 		embedding:      false,
+		returnLogits:   req.ReturnLogits,
 	})
 	if err != nil {
 		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
@@ -691,10 +728,10 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			close(seq.quit)
 			return
 		case content, ok := <-seq.responses:
 			if ok {
-				if err := json.NewEncoder(w).Encode(&CompletionResponse{
-					Content: content,
-				}); err != nil {
+				slog.Debug("streaming response",
+					"content_len", len(content.Content), "logits_len", len(content.Logits))
+				if err := json.NewEncoder(w).Encode(&content); err != nil {
 					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 					close(seq.quit)
 					return

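The math and sort imports added at the top of runner.go are not used by any hunk in this diff; together with the TokenLogprob type added to api/types.go, they look staged for a top-logprobs computation. A sketch of what that could be, under those assumptions (topLogprobs and tokenToPiece are illustrative, not code from this commit):

package logprobs

import (
	"math"
	"sort"
)

// TokenLogprob mirrors the type added to api/types.go.
type TokenLogprob struct {
	Token   string  `json:"token"`
	Logprob float32 `json:"logprob"`
}

// topLogprobs converts one vocabulary-sized logit vector into the k most
// probable tokens with their log-probabilities. tokenToPiece stands in
// for the runner's model.TokenToPiece.
func topLogprobs(logits []float32, k int, tokenToPiece func(int) string) []TokenLogprob {
	if len(logits) == 0 || k <= 0 {
		return nil
	}

	// numerically stable log-softmax: logprob = logit - logSumExp
	maxLogit := logits[0]
	for _, l := range logits[1:] {
		if l > maxLogit {
			maxLogit = l
		}
	}
	var sumExp float64
	for _, l := range logits {
		sumExp += math.Exp(float64(l - maxLogit))
	}
	logSumExp := maxLogit + float32(math.Log(sumExp))

	// rank token ids by logit; the top k by logit are the top k by logprob
	ids := make([]int, len(logits))
	for i := range ids {
		ids[i] = i
	}
	sort.Slice(ids, func(a, b int) bool { return logits[ids[a]] > logits[ids[b]] })

	if k > len(ids) {
		k = len(ids)
	}
	out := make([]TokenLogprob, 0, k)
	for _, id := range ids[:k] {
		out = append(out, TokenLogprob{Token: tokenToPiece(id), Logprob: logits[id] - logSumExp})
	}
	return out
}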
llm/server.go (+15, -9)

@@ -642,11 +642,12 @@ type ImageData struct {
 }

 type completion struct {
-	Content      string `json:"content"`
-	Model        string `json:"model"`
-	Prompt       string `json:"prompt"`
-	Stop         bool   `json:"stop"`
-	StoppedLimit bool   `json:"stopped_limit"`
+	Content      string    `json:"content"`
+	Model        string    `json:"model"`
+	Prompt       string    `json:"prompt"`
+	Stop         bool      `json:"stop"`
+	StoppedLimit bool      `json:"stopped_limit"`
+	Logits       []float32 `json:"logits,omitempty"`

 	Timings struct {
 		PredictedN  int     `json:"predicted_n"`
@@ -657,10 +658,11 @@ type completion struct {
 }

 type CompletionRequest struct {
-	Prompt  string
-	Format  json.RawMessage
-	Images  []ImageData
-	Options *api.Options
+	Prompt       string
+	Format       json.RawMessage
+	Images       []ImageData
+	Options      *api.Options
+	ReturnLogits bool
 }

 type CompletionResponse struct {
@@ -671,6 +673,7 @@ type CompletionResponse struct {
 	PromptEvalDuration time.Duration
 	EvalCount          int
 	EvalDuration       time.Duration
+	Logits             []float32
 }

 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
@@ -696,6 +699,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 		"seed":              req.Options.Seed,
 		"stop":              req.Options.Stop,
 		"image_data":        req.Images,
+		"return_logits":     req.ReturnLogits,
 		"cache_prompt":      true,
 	}

@@ -821,6 +825,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 			if c.Content != "" {
 				fn(CompletionResponse{
 					Content: c.Content,
+					Logits:  c.Logits,
 				})
 			}

@@ -837,6 +842,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 					PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
 					EvalCount:          c.Timings.PredictedN,
 					EvalDuration:       parseDurationMs(c.Timings.PredictedMS),
+					Logits:             c.Logits,
 				})
 				return nil
 			}

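With this plumbing, any holder of an llm.LlamaServer can request per-chunk logits through the same callback it already uses for text. A sketch of consuming them, reducing each chunk's vector to its argmax token id; argmaxTokens is illustrative and assumes imports of the context, api, and llm packages:

// argmaxTokens streams a completion and records, for each chunk that
// carries logits, the id of the highest-scoring token.
func argmaxTokens(ctx context.Context, s llm.LlamaServer, prompt string, opts *api.Options) ([]int, error) {
	var ids []int
	err := s.Completion(ctx, llm.CompletionRequest{
		Prompt:       prompt,
		Options:      opts,
		ReturnLogits: true,
	}, func(r llm.CompletionResponse) {
		if len(r.Logits) == 0 {
			return // chunk arrived without logits
		}
		best := 0
		for i, l := range r.Logits {
			if l > r.Logits[best] {
				best = i
			}
		}
		ids = append(ids, best)
	})
	return ids, err
}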
server/routes.go (+14, -8)

@@ -295,10 +295,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		var sb strings.Builder
 		defer close(ch)
 		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
-			Prompt:  prompt,
-			Images:  images,
-			Format:  req.Format,
-			Options: opts,
+			Prompt:       prompt,
+			Images:       images,
+			Format:       req.Format,
+			Options:      opts,
+			ReturnLogits: req.ReturnLogits,
 		}, func(cr llm.CompletionResponse) {
 			res := api.GenerateResponse{
 				Model:      req.Model,
@@ -312,6 +313,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 					EvalCount:          cr.EvalCount,
 					EvalDuration:       cr.EvalDuration,
 				},
+				Logits: cr.Logits,
 			}

 			if _, err := sb.WriteString(cr.Content); err != nil {
@@ -1541,16 +1543,19 @@ func (s *Server) ChatHandler(c *gin.Context) {

 	slog.Debug("chat request", "images", len(images), "prompt", prompt)

+	slog.Debug("chat request", "return_logits", req.ReturnLogits)
+
 	ch := make(chan any)
 	go func() {
 		defer close(ch)
 		var sb strings.Builder
 		var toolCallIndex int = 0
 		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
-			Prompt:  prompt,
-			Images:  images,
-			Format:  req.Format,
-			Options: opts,
+			Prompt:       prompt,
+			Images:       images,
+			Format:       req.Format,
+			Options:      opts,
+			ReturnLogits: req.ReturnLogits,
 		}, func(r llm.CompletionResponse) {
 			res := api.ChatResponse{
 				Model:      req.Model,
@@ -1558,6 +1563,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				Message:    api.Message{Role: "assistant", Content: r.Content},
 				Done:       r.Done,
 				DoneReason: r.DoneReason,
+				Logits:     r.Logits,
 				Metrics: api.Metrics{
 					PromptEvalCount:    r.PromptEvalCount,
 					PromptEvalDuration: r.PromptEvalDuration,
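End to end, the chat path now mirrors generate. A client-side sketch using this repo's api package (chatWithLogits and the model name are illustrative):

// chatWithLogits streams one chat turn and reports how many logits
// accompany each chunk.
func chatWithLogits(ctx context.Context, client *api.Client) error {
	req := &api.ChatRequest{
		Model:        "llama3.2", // illustrative
		Messages:     []api.Message{{Role: "user", Content: "Say hi in one word."}},
		ReturnLogits: true,
	}
	return client.Chat(ctx, req, func(resp api.ChatResponse) error {
		fmt.Printf("%q -> %d logits\n", resp.Message.Content, len(resp.Logits))
		return nil
	})
}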