Browse Source

add missing file

Patrick Devine 11 months ago
parent
commit
2d315ba9a9
1 changed files with 72 additions and 0 deletions
  1. 72 0
      convert/tokenizer.go

+ 72 - 0
convert/tokenizer.go

@@ -0,0 +1,72 @@
+package convert
+
+import (
+	"encoding/json"
+	"io/ioutil"
+	"os"
+)
+
// Tokenizer mirrors the top-level layout of a HuggingFace tokenizer.json
// file, carrying only the fields this converter needs.
type Tokenizer struct {
	Version     string         `json:"version"`
	AddedTokens []Token        `json:"added_tokens"` // special/user-added tokens layered on top of the base vocab
	Model       TokenizerModel `json:"model"`
}
+
// TokenizerModel is the "model" object of tokenizer.json: the token type,
// the string-to-ID vocabulary, and the BPE merge rules.
type TokenizerModel struct {
	Type   string         `json:"type"`
	Vocab  map[string]int `json:"vocab"`
	Merges []string       `json:"merges"`
	// Tokens is not part of the JSON file (no json tag); it is a dense,
	// ID-indexed table built by newTokenizer from Vocab and AddedTokens.
	Tokens []Token
}
+
// Token is a single vocabulary entry, either parsed from tokenizer.json
// or synthesized from the vocab map.
type Token struct {
	ID      int    `json:"id"`
	Content string `json:"content"`
	Special bool   `json:"special"`
	// UserDefined is not read from JSON; newTokenizer sets it to true
	// for entries that came from the added_tokens list.
	UserDefined bool
}
+
+func (t *Tokenizer) getMaxID() int {
+	var maxID int
+	for _, v := range t.Model.Vocab {
+		maxID = max(maxID, v)
+	}
+
+	for _, v := range t.AddedTokens {
+		maxID = max(maxID, v.ID)
+	}
+	return maxID
+}
+
+func newTokenizer(dirpath string) (*Tokenizer, error) {
+	f, err := os.Open(dirpath)
+	if err != nil {
+		panic(err)
+	}
+	defer f.Close()
+
+	data, err := ioutil.ReadAll(f)
+	if err != nil {
+		return nil, err
+	}
+
+	var tdata Tokenizer
+
+	if err := json.Unmarshal(data, &tdata); err != nil {
+		return nil, err
+	}
+
+	maxID := tdata.getMaxID()
+	tdata.Model.Tokens = make([]Token, maxID+1)
+
+	for k, v := range tdata.Model.Vocab {
+		tdata.Model.Tokens[v] = Token{ID: v, Content: k, Special: false, UserDefined: false}
+	}
+
+	for _, v := range tdata.AddedTokens {
+		v.UserDefined = true
+		tdata.Model.Tokens[v.ID] = v
+	}
+
+	return &tdata, nil
+}