
JSON mode: add `"format": "json"` as an API parameter (#1051)

* add `"format": "json"` as an API parameter
---------
Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
Jeffrey Morgan 1 year ago
parent
commit
5cba29b9d6
5 changed files with 97 additions and 9 deletions
  1. api/types.go (+1 -0)
  2. docs/api.md (+58 -6)
  3. llm/llama.go (+33 -1)
  4. llm/llm.go (+1 -1)
  5. server/routes.go (+4 -1)

+ 1 - 0
api/types.go

@@ -38,6 +38,7 @@ type GenerateRequest struct {
 	Context  []int  `json:"context,omitempty"`
 	Stream   *bool  `json:"stream,omitempty"`
 	Raw      bool   `json:"raw,omitempty"`
+	Format   string `json:"format"`
 
 	Options map[string]interface{} `json:"options"`
 }

+ 58 - 6
docs/api.md

@@ -38,6 +38,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 
 Advanced parameters (optional):
 
@@ -48,13 +49,17 @@ Advanced parameters (optional):
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `raw`: if `true` no formatting will be applied to the prompt and no context will be returned. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API, and are managing history yourself.
 
+### JSON mode
+
+Enable JSON mode by setting the `format` parameter to `json` and specifying the model should use JSON in the `prompt`. This will structure the response as valid JSON. See the JSON mode [example](#request-json-mode) below.
+
 ### Examples
 
 #### Request
 
 ```shell
 curl -X POST http://localhost:11434/api/generate -d '{
-  "model": "llama2:7b",
+  "model": "llama2",
   "prompt": "Why is the sky blue?"
 }'
 ```
@@ -65,7 +70,7 @@ A stream of JSON objects is returned:
 
 ```json
 {
-  "model": "llama2:7b",
+  "model": "llama2",
   "created_at": "2023-08-04T08:52:19.385406455-07:00",
   "response": "The",
   "done": false
@@ -89,7 +94,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 
 ```json
 {
-  "model": "llama2:7b",
+  "model": "llama2",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "",
   "context": [1, 2, 3],
@@ -105,7 +110,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 }
 ```
 
-#### Request
+#### Request (No streaming)
 
 ```shell
 curl -X POST http://localhost:11434/api/generate -d '{
@@ -137,7 +142,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 }
 ```
 
-#### Request
+#### Request (Raw mode)
 
 In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting and context.
 
@@ -167,7 +172,54 @@ curl -X POST http://localhost:11434/api/generate -d '{
 }
 ```
 
-#### Request
+#### Request (JSON mode)
+
+```shell
+curl -X POST http://localhost:11434/api/generate -d '{
+  "model": "llama2",
+  "prompt": "What color is the sky at different times of the day? Respond using JSON",
+  "format": "json",
+  "stream": false
+}'
+```
+
+#### Response
+
+```json
+{
+  "model": "llama2",
+  "created_at": "2023-11-09T21:07:55.186497Z",
+  "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
+  "done": true,
+  "total_duration": 4661289125,
+  "load_duration": 1714434500,
+  "prompt_eval_count": 36,
+  "prompt_eval_duration": 264132000,
+  "eval_count": 75,
+  "eval_duration": 2112149000
+}
+```
+
+The value of `response` will be a string containing JSON similar to:
+
+```json
+{
+  "morning": {
+    "color": "blue"
+  },
+  "noon": {
+    "color": "blue-gray"
+  },
+  "afternoon": {
+    "color": "warm gray"
+  },
+  "evening": {
+    "color": "orange"
+  }
+}
+```
+
+#### Request (With options)
 
 If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override.
 
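The docs above use curl; the equivalent call from Go looks roughly like the sketch below. This is a minimal illustration, not part of the commit: it assumes an Ollama server on the default `localhost:11434`, a pulled `llama2` model, and declares only the response fields it reads.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

// generateResponse declares only the fields this sketch uses; the real API
// response also carries timings, context, and other metadata.
type generateResponse struct {
	Model    string `json:"model"`
	Response string `json:"response"`
	Done     bool   `json:"done"`
}

func main() {
	body, err := json.Marshal(map[string]interface{}{
		"model":  "llama2",
		"prompt": "What color is the sky at different times of the day? Respond using JSON",
		"format": "json", // the new parameter: constrain output to valid JSON
		"stream": false,  // return a single response object instead of a stream
	})
	if err != nil {
		log.Fatal(err)
	}

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var out generateResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		log.Fatal(err)
	}

	// `response` is itself a string of JSON, so it can be parsed a second time.
	var parsed map[string]interface{}
	if err := json.Unmarshal([]byte(out.Response), &parsed); err != nil {
		log.Fatal(err)
	}
	fmt.Println(parsed)
}
```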

+ 33 - 1
llm/llama.go

@@ -27,6 +27,34 @@ import (
 	"github.com/jmorganca/ollama/format"
 	"github.com/jmorganca/ollama/format"
 )
 )
 
 
+const jsonGrammar = `
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+
+string ::=
+  "\"" (
+    [^"\\] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= ([ \t\n] ws)?
+`
+
 //go:embed llama.cpp/*/build/*/bin/*
 var llamaCppEmbed embed.FS
 
@@ -497,7 +525,7 @@ type prediction struct {
 
 const maxBufferSize = 512 * format.KiloByte
 
-func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
+func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, format string, fn func(api.GenerateResponse)) error {
 	prevConvo, err := llm.Decode(ctx, prevContext)
 	if err != nil {
 		return err
@@ -532,6 +560,10 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
 		"stop":              llm.Stop,
 	}
 
+	if format == "json" {
+		request["grammar"] = jsonGrammar
+	}
+
 	// Handling JSON marshaling with special characters unescaped.
 	buffer := &bytes.Buffer{}
 	enc := json.NewEncoder(buffer)
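Read in isolation, the change to `Predict` boils down to attaching the grammar when JSON mode is requested. The helper below is hypothetical (the real code builds the request map inline inside `Predict`, within package `llm`) and shows only a subset of the fields, but it captures the control flow added by this commit.

```go
// buildPredictBody is a hypothetical helper for illustration only; it is not
// part of this commit. It sketches how the llama.cpp server request gains a
// "grammar" field when the caller asked for JSON mode.
func buildPredictBody(prompt, format string) map[string]interface{} {
	request := map[string]interface{}{
		"prompt": prompt, // the fully templated prompt
		// ...the real code also sets sampling options such as "stop" here
	}
	if format == "json" {
		// jsonGrammar is the GBNF grammar constant added above; llama.cpp
		// applies it during sampling so the output stays inside the JSON grammar.
		request["grammar"] = jsonGrammar
	}
	return request
}
```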

+ 1 - 1
llm/llm.go

@@ -14,7 +14,7 @@ import (
 )
 
 type LLM interface {
-	Predict(context.Context, []int, string, func(api.GenerateResponse)) error
+	Predict(context.Context, []int, string, string, func(api.GenerateResponse)) error
 	Embedding(context.Context, string) ([]float64, error)
 	Encode(context.Context, string) ([]int, error)
 	Decode(context.Context, []int) (string, error)

+ 4 - 1
server/routes.go

@@ -163,6 +163,9 @@ func GenerateHandler(c *gin.Context) {
 	case req.Model == "":
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
 		return
+	case len(req.Format) > 0 && req.Format != "json":
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be json"})
+		return
 	case req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0):
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
 		return
@@ -231,7 +234,7 @@ func GenerateHandler(c *gin.Context) {
 			ch <- r
 		}
 
-		if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
+		if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, req.Format, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
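Putting the routing change together: the handler rejects any `format` other than empty or `json`, then passes `req.Format` straight through the `LLM` interface to the runner. The standalone check below is a hypothetical restatement of the new switch case, shown only to make the accepted values explicit; in the real code the check lives inline in `GenerateHandler`.

```go
package server

import "errors"

// checkFormat mirrors the new validation in GenerateHandler: an empty format
// means no constraint, "json" enables grammar-constrained sampling, and any
// other value is rejected with a 400 ("format must be json").
func checkFormat(format string) error {
	if len(format) > 0 && format != "json" {
		return errors.New("format must be json")
	}
	return nil
}
```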