optional parameter to not stream response (#639)

* update streaming request accept header
* add optional stream param to request bodies
Bruce MacDonald, 1 year ago
parent commit 274d5a5fdf
4 changed files with 94 additions and 18 deletions
  1. api/client.go (+5 -1)
  2. api/types.go (+11 -3)
  3. docs/api.md (+21 -14)
  4. server/routes.go (+57 -0)

api/client.go (+5 -1)

@@ -17,6 +17,10 @@ import (
 	"github.com/jmorganca/ollama/version"
 )
 
+const DefaultHost = "127.0.0.1:11434"
+
+var envHost = os.Getenv("OLLAMA_HOST")
+
 type Client struct {
 	base *url.URL
 	http http.Client
@@ -143,7 +147,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	}
 
 	request.Header.Set("Content-Type", "application/json")
-	request.Header.Set("Accept", "application/json")
+	request.Header.Set("Accept", "application/x-ndjson")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
 
 	response, err := c.http.Do(request)
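
The `Accept` header now matches the `application/x-ndjson` content type the server uses for streamed replies: one JSON object per line. As a rough sketch of how a consumer outside this client might read such a stream (the endpoint and field names are assumed from docs/api.md and api/types.go; this is not code from this commit):

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

// generateResponse mirrors just the fields of api.GenerateResponse used here.
type generateResponse struct {
	Response string `json:"response"`
	Done     bool   `json:"done"`
}

func main() {
	body := strings.NewReader(`{"model": "llama2:7b", "prompt": "Why is the sky blue?"}`)
	req, err := http.NewRequest(http.MethodPost, "http://localhost:11434/api/generate", body)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "application/x-ndjson")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Each line of the streamed body is a standalone JSON object.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var r generateResponse
		if err := json.Unmarshal(scanner.Bytes(), &r); err != nil {
			panic(err)
		}
		fmt.Print(r.Response)
		if r.Done {
			fmt.Println()
		}
	}
	if err := scanner.Err(); err != nil {
		panic(err)
	}
}
```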

api/types.go (+11 -3)

@@ -37,6 +37,7 @@ type GenerateRequest struct {
 	System   string `json:"system"`
 	Template string `json:"template"`
 	Context  []int  `json:"context,omitempty"`
+	Stream   *bool  `json:"stream,omitempty"`
 
 	Options map[string]interface{} `json:"options"`
 }
@@ -53,8 +54,9 @@ type EmbeddingResponse struct {
 }
 
 type CreateRequest struct {
-	Name string `json:"name"`
-	Path string `json:"path"`
+	Name   string `json:"name"`
+	Path   string `json:"path"`
+	Stream *bool  `json:"stream,omitempty"`
 }
 
 type DeleteRequest struct {
@@ -81,6 +83,9 @@ type CopyRequest struct {
 type PullRequest struct {
 	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
+	Username string `json:"username"`
+	Password string `json:"password"`
+	Stream   *bool  `json:"stream,omitempty"`
 }
 
 type ProgressResponse struct {
@@ -93,6 +98,9 @@ type ProgressResponse struct {
 type PushRequest struct {
 	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
+	Username string `json:"username"`
+	Password string `json:"password"`
+	Stream   *bool  `json:"stream,omitempty"`
 }
 
 type ListResponse struct {
@@ -113,7 +121,7 @@ type TokenResponse struct {
 type GenerateResponse struct {
 	Model     string    `json:"model"`
 	CreatedAt time.Time `json:"created_at"`
-	Response  string    `json:"response,omitempty"`
+	Response  string    `json:"response"`
 
 	Done    bool  `json:"done"`
 	Context []int `json:"context,omitempty"`
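
`Stream` is a `*bool` rather than a plain `bool` so the field has three states: nil (omitted from the JSON, so the server keeps its streaming default), explicit `false` (single aggregated response), and explicit `true`. A plain `bool` with `omitempty` could not put `false` on the wire. A minimal sketch of the difference, using a hypothetical trimmed-down struct rather than the full `api.GenerateRequest`:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// generateRequest is a trimmed-down stand-in for api.GenerateRequest.
type generateRequest struct {
	Model  string `json:"model"`
	Prompt string `json:"prompt"`
	Stream *bool  `json:"stream,omitempty"`
}

func main() {
	// Stream left nil: the field is omitted and the server default (streaming) applies.
	b, _ := json.Marshal(generateRequest{Model: "llama2:7b", Prompt: "hi"})
	fmt.Println(string(b)) // {"model":"llama2:7b","prompt":"hi"}

	// Stream pointing at false: the field is serialized and streaming is disabled.
	off := false
	b, _ = json.Marshal(generateRequest{Model: "llama2:7b", Prompt: "hi", Stream: &off})
	fmt.Println(string(b)) // {"model":"llama2:7b","prompt":"hi","stream":false}
}
```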

docs/api.md (+21 -14)

@@ -12,7 +12,6 @@
 - [Push a Model](#push-a-model)
 - [Generate Embeddings](#generate-embeddings)
 
-
 ## Conventions
 
 ### Model names
@@ -40,12 +39,13 @@ Generate a response for a given prompt with a provided model. This is a streamin
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
 
-Advanced parameters:
+Advanced parameters (optional):
 
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system prompt (overrides what is defined in the `Modelfile`)
 - `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`; this can be used to keep a short conversational memory
+- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -80,6 +80,7 @@ The final response in the stream also includes additional data about the generat
 - `eval_count`: number of tokens in the response
 - `eval_duration`: time in nanoseconds spent generating the response
 - `context`: an encoding of the conversation used in this response; this can be sent in the next request to keep a conversational memory
+- `response`: empty if the response was streamed; if not streamed, this will contain the full response
 
 To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` by `eval_duration` and multiply by `10^9` (`eval_duration` is reported in nanoseconds).
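 For example, with illustrative numbers: an `eval_count` of 290 tokens over an `eval_duration` of 4,709,213,558 ns works out to 290 / 4,709,213,558 × 10^9 ≈ 61.6 token/s.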
 
@@ -87,6 +88,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 {
   "model": "llama2:7b",
   "created_at": "2023-08-04T19:22:45.499127Z",
+  "response": "",
   "context": [1, 2, 3],
   "done": true,
   "total_duration": 5589157167,
@@ -112,6 +114,7 @@ Create a model from a [`Modelfile`](./modelfile.md)
 
 - `name`: name of the model to create
 - `path`: path to the Modelfile
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -179,7 +182,7 @@ Show details about a model including modelfile, template, parameters, license, a
 
 ### Request
 
-```shell  
+```shell
 curl http://localhost:11434/api/show -d '{
   "name": "llama2:7b"
 }'
@@ -189,10 +192,10 @@ curl http://localhost:11434/api/show -d '{
 
 ```json
 {
-    "license": "<contents of license block>",
-    "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
-    "parameters": "stop                           [INST]\nstop                           [/INST]\nstop                           <<SYS>>\nstop                           <</SYS>>",
-    "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
+  "license": "<contents of license block>",
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
+  "parameters": "stop                           [INST]\nstop                           [/INST]\nstop                           <<SYS>>\nstop                           <</SYS>>",
+  "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
 }
 ```
 
@@ -245,6 +248,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 
 - `name`: name of the model to pull
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -275,7 +279,8 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
 ### Parameters
 
 - `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
-- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.  
+- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -290,15 +295,16 @@ curl -X POST http://localhost:11434/api/push -d '{
 Streaming response that starts with:
 
 ```json
-{"status":"retrieving manifest"}
+{ "status": "retrieving manifest" }
 ```
 
 and then:
 
 ```json
 {
-"status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
-"total":1928429856
+  "status": "starting upload",
+  "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total": 1928429856
 }
 ```
 
@@ -306,9 +312,10 @@ Then there is a series of uploading responses:
 
 ```json
 {
-"status":"starting upload",
-"digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
-"total":1928429856}
+  "status": "starting upload",
+  "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total": 1928429856
+}
 ```
 
 Finally, when the upload is complete:

server/routes.go (+57 -0)

@@ -240,6 +240,23 @@ func GenerateHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		var response api.GenerateResponse
+		generated := ""
+		for resp := range ch {
+			if r, ok := resp.(api.GenerateResponse); ok {
+				generated += r.Response
+				response = r
+			} else {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected response"})
+				return
+			}
+		}
+		response.Response = generated
+		c.JSON(http.StatusOK, response)
+		return
+	}
+
 	streamResponse(c, ch)
 }
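
With `stream` set to `false`, the handler above drains the channel, concatenates the partial `Response` fields, and replies with the final object carrying the full text. A hedged sketch of the corresponding client-side call (URL and body fields assumed from docs/api.md; this is not part of the commit):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// "stream": false asks for one aggregated JSON object instead of an NDJSON stream.
	payload := []byte(`{"model": "llama2:7b", "prompt": "Why is the sky blue?", "stream": false}`)
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The single object carries the concatenated text in its "response" field.
	var out struct {
		Response string `json:"response"`
		Done     bool   `json:"done"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Response)
}
```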
 
@@ -309,6 +326,11 @@ func PullModelHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -336,6 +358,11 @@ func PushModelHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -363,6 +390,11 @@ func CreateModelHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -603,6 +635,31 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 	return s.Serve(ln)
 }
 
+func waitForStream(c *gin.Context, ch chan interface{}) {
+	c.Header("Content-Type", "application/json")
+	for resp := range ch {
+		switch r := resp.(type) {
+		case api.ProgressResponse:
+			if r.Status == "success" {
+				c.JSON(http.StatusOK, r)
+				return
+			}
+		case gin.H:
+			if errorMsg, ok := r["error"].(string); ok {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
+				return
+			} else {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in progress response"})
+				return
+			}
+		default:
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected progress response"})
+			return
+		}
+	}
+	c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected end of progress response"})
+}
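
`waitForStream` applies the same idea to the progress-based handlers (create, pull, push): it consumes the channel until a `ProgressResponse` with status `"success"` or an error value arrives, then replies with that single JSON object. A simplified, self-contained illustration of the drain-until-terminal loop, with a hypothetical `progress` type standing in for `api.ProgressResponse` and no gin dependency:

```go
package main

import (
	"errors"
	"fmt"
)

// progress is a stand-in for api.ProgressResponse in this illustration.
type progress struct {
	Status string `json:"status"`
}

// waitForDone mirrors the shape of waitForStream: drain the channel and return
// the terminal "success" value, or an error if the channel closes without one.
func waitForDone(ch <-chan progress) (progress, error) {
	for p := range ch {
		if p.Status == "success" {
			return p, nil
		}
	}
	return progress{}, errors.New("unexpected end of progress response")
}

func main() {
	ch := make(chan progress, 3)
	ch <- progress{Status: "pulling manifest"}
	ch <- progress{Status: "verifying sha256 digest"}
	ch <- progress{Status: "success"}
	close(ch)

	p, err := waitForDone(ch)
	if err != nil {
		panic(err)
	}
	fmt.Println(p.Status) // success
}
```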
+
 func streamResponse(c *gin.Context, ch chan any) {
 	c.Header("Content-Type", "application/x-ndjson")
 	c.Stream(func(w io.Writer) bool {