@@ -79,6 +79,118 @@ func TestParseTokenizer(t *testing.T) {
 				Template: "<default template>",
 			},
 		},
+		{
+			name: "added tokens",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 999,
+							"content": "<unused999>",
+							"special": false
+						}
+					]
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<unused999>"},
+					Scores: []float32{999},
+					Types:  []int32{4},
+				},
+				Pre: "default",
+			},
+		},
+		{
+			name: "added tokens overlap vocab",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<pad>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<pad>": 0
+						}
+					}
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<pad>"},
+					Scores: []float32{0},
+					Types:  []int32{3},
+				},
+				Pre: "default",
+			},
+		},
+		{
+			name: "special token types",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<pad>",
+							"special": true
+						},
+						{
+							"id": 1,
+							"content": "<eos>",
+							"special": true
+						},
+						{
+							"id": 2,
+							"content": "<bos>",
+							"special": true
+						},
+						{
+							"id": 3,
+							"content": "<unk>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<pad>": 0,
+							"<eos>": 1,
+							"<bos>": 2,
+							"<unk>": 3
+						}
+					}
+				}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"add_bos_token": true,
+					"add_eos_token": false,
+					"bos_token": "<bos>",
+					"eos_token": "<eos>",
+					"pad_token": "<pad>",
+					"unk_token": "<unk>"
+				}`),
+			}),
+			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<pad>", "<eos>", "<bos>", "<unk>"},
+					Scores: []float32{0, 1, 2, 3},
+					Types:  []int32{3, 3, 3, 3},
+				},
+				SpecialVocabulary: []*SpecialVocabulary{
+					{Type: "pad", Content: "<pad>", ID: 0, AddToken: false},
+					{Type: "eos", Content: "<eos>", ID: 1, AddToken: false},
+					{Type: "bos", Content: "<bos>", ID: 2, AddToken: true},
+					{Type: "unk", Content: "<unk>", ID: 3, AddToken: false},
+				},
+				Pre: "default",
+			},
+		},
 	}
 
 	for _, tt := range cases {