Browse Source

llm: set done reason at server level

No functional change. Many different done reasons can be set at the runner
level, so rather than obscuring them we should return them to the server
process and let it choose what to do with the done reason. This separates
the API concerns from the runner.
Bruce MacDonald 1 month ago
parent
commit
22f2f6e229
3 changed files with 10 additions and 12 deletions
  1. 8 0
      llm/server.go
  2. 1 6
      runner/llamarunner/runner.go
  3. 1 6
      runner/ollamarunner/runner.go

+ 8 - 0
llm/server.go

@@ -796,6 +796,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 			if err := json.Unmarshal(evt, &c); err != nil {
 				return fmt.Errorf("error unmarshalling llm prediction response: %v", err)
 			}
+			// convert internal done reason to one of our standard api format done reasons
+			switch c.DoneReason {
+			case "limit":
+				c.DoneReason = "length"
+			default:
+				c.DoneReason = "stop"
+			}
+
 			switch {
 			case strings.TrimSpace(c.Content) == lastToken:
 				tokenRepeat++

+ 1 - 6
runner/llamarunner/runner.go

@@ -647,14 +647,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 
 				flusher.Flush()
 			} else {
-				// Send the final response
-				doneReason := "stop"
-				if seq.doneReason == "limit" {
-					doneReason = "length"
-				}
 				if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
 					Done:               true,
-					DoneReason:         doneReason,
+					DoneReason:         seq.doneReason,
 					PromptEvalCount:    seq.numPromptInputs,
 					PromptEvalDuration: seq.startGenerationTime.Sub(seq.startProcessingTime),
 					EvalCount:          seq.numDecoded,

+ 1 - 6
runner/ollamarunner/runner.go

@@ -627,14 +627,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 
 				flusher.Flush()
 			} else {
-				// Send the final response
-				doneReason := "stop"
-				if seq.doneReason == "limit" {
-					doneReason = "length"
-				}
 				if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
 					Done:               true,
-					DoneReason:         doneReason,
+					DoneReason:         seq.doneReason,
 					PromptEvalCount:    seq.numPromptInputs,
 					PromptEvalDuration: seq.startGenerationTime.Sub(seq.startProcessingTime),
 					EvalCount:          seq.numPredicted,