tokenizer.go

package convert

import (
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"errors"
	"fmt"
	"io/fs"
	"log/slog"
	"os"
	"slices"
	"strings"

	"golang.org/x/exp/maps"
)

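// Token type identifiers assigned to each vocabulary entry in the converted
// model; the leading blank identifier keeps the meaningful values starting at 1.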
const (
	_ int32 = iota
	tokenTypeNormal
	tokenTypeUnknown
	tokenTypeControl
	tokenTypeUserDefined
	tokenTypeUnused
	tokenTypeByte
)

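// Tokenizer aggregates everything parsed from a model's tokenizer files: the
// base vocabulary, special tokens, BPE merges, the pretokenizer preset, and
// the chat template.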
type Tokenizer struct {
	*Vocabulary
	SpecialVocabulary []*SpecialVocabulary
	Merges            []string
	Pre               string
	Template          string
}

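// parseTokenizer builds a Tokenizer from the files in fsys. The vocabulary is
// always required; merges and the pretokenizer preset come from tokenizer.json,
// and the chat template and special tokens (limited to specialTokenTypes) come
// from tokenizer_config.json, when those files are present.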
func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error) {
	v, err := parseVocabulary(fsys)
	if err != nil {
		return nil, err
	}

	t := &Tokenizer{
		Vocabulary: v,
		Pre:        "default",
	}

	addedTokens := make(map[string]token)
	if f, err := fsys.Open("tokenizer.json"); errors.Is(err, os.ErrNotExist) {
		// tokenizer.json is optional; keep the defaults
	} else if err != nil {
		return nil, err
	} else {
		defer f.Close()

		var tt tokenizer
		if err := json.NewDecoder(f).Decode(&tt); err != nil {
			return nil, err
		}

		// index added tokens by content so special tokens can be resolved to IDs later
		for _, t := range tt.AddedTokens {
			addedTokens[t.Content] = t
		}

		// merges may be encoded as either []string or [][]string
		if len(tt.Model.Merges) == 0 {
			// noop; merges is empty
		} else if err := json.Unmarshal(tt.Model.Merges, &t.Merges); err == nil {
			// noop; merges is []string
		} else if merges, err := func() ([][]string, error) {
			var merges [][]string
			if err := json.Unmarshal(tt.Model.Merges, &merges); err != nil {
				return nil, err
			}
			return merges, nil
		}(); err == nil {
			t.Merges = make([]string, len(merges))
			for i := range merges {
				t.Merges[i] = strings.Join(merges[i], " ")
			}
		} else {
			return nil, fmt.Errorf("could not parse tokenizer merges. expected []string or [][]string: %w", err)
		}

		sha256sum := sha256.New()
		for _, pt := range tt.PreTokenizer.PreTokenizers {
			switch pt.Type {
			case "Split":
				if pt.Pattern.Regex != "" {
					// create a checksum of all Split pretokenizers which should be sufficient
					// to identify the pretokenizer
					sha256sum.Write([]byte(pt.Pattern.Regex))
				}
			}
		}

		switch digest := hex.EncodeToString(sha256sum.Sum(nil)); digest {
		case "d98f9631be1e9607a9848c26c1f9eac1aa9fc21ac6ba82a2fc0741af9780a48f":
			t.Pre = "llama-bpe"
		case "03df5c5863ad70781dcfdef491ead25140f895fe8010964be0daefe27be32b02":
			t.Pre = "deepseek-llm"
		case "21cde974d587f0d54dc8d56b183cc1e6239600172035c68fbd6d4b9f8da0576e":
			t.Pre = "deepseek-coder"
		case "1ff7f41064896984db5d1bb6ff64fa4bc29007d08c1b439e505b7392777a319e":
			t.Pre = "qwen2"
		case "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855":
			// noop, empty pretokenizer
		default:
			slog.Warn("unknown pretokenizer, using default", "digest", digest)
		}
	}

	if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
		// tokenizer_config.json is optional; keep the defaults
	} else if err != nil {
		return nil, err
	} else {
		defer f.Close()

		var p map[string]json.RawMessage
		if err := json.NewDecoder(f).Decode(&p); err != nil {
			return nil, err
		}

		// chat_template may be a plain string or a list of named templates
		if template, ok := p["chat_template"]; ok {
			var s []struct {
				Name     string `json:"name"`
				Template string `json:"template"`
			}
			if err := json.Unmarshal(template, &t.Template); err == nil {
				// noop; chat_template is a single template
			} else if err := json.Unmarshal(template, &s); err == nil {
				for _, e := range s {
					if e.Name == "default" {
						t.Template = e.Template
						break
					}
				}
			} else {
				return nil, fmt.Errorf("invalid chat_template: %w", err)
			}
		}

		// resolve each requested special token (e.g. bos, eos) to its content and ID
		for _, st := range specialTokenTypes {
			sv := SpecialVocabulary{Type: st}
			if bts, ok := p[fmt.Sprintf("add_%s_token", st)]; ok {
				if err := json.Unmarshal(bts, &sv.AddToken); err != nil {
					return nil, err
				}
			}

			if bts, ok := p[fmt.Sprintf("%s_token", st)]; ok {
				var content string
				if err := json.Unmarshal(bts, &content); err != nil {
					// the token may also be an object with a "content" field
					var mm map[string]any
					if err := json.Unmarshal(bts, &mm); err != nil {
						continue
					}

					content, ok = mm["content"].(string)
					if !ok {
						continue
					}
				}

				sv.Content = content
			}

			if id, ok := addedTokens[sv.Content]; ok {
				sv.ID = id.ID
				t.SpecialVocabulary = append(t.SpecialVocabulary, &sv)
			}
		}
	}

	return t, nil
}

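// tokenizer mirrors the subset of a Hugging Face tokenizer.json file that the
// converter reads: added tokens, the model vocabulary and merges, and the
// pretokenizer definitions.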
type tokenizer struct {
	AddedTokens []token `json:"added_tokens"`

	Model struct {
		Type   string          `json:"type"`
		Vocab  map[string]int  `json:"vocab"`
		Merges json.RawMessage `json:"merges"`
	} `json:"model"`

	PreTokenizer struct {
		PreTokenizers []struct {
			Type    string `json:"type"`
			Pattern struct {
				Regex string `json:"Regex"`
			} `json:"pattern"`
		} `json:"pretokenizers"`
	} `json:"pre_tokenizer"`
}

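// token is a single vocabulary or added_tokens entry. UserDefined is not part
// of the JSON; it is set by the converter to mark tokens that came from
// added_tokens.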
type token struct {
	ID          int    `json:"id"`
	Content     string `json:"content"`
	Special     bool   `json:"special"`
	UserDefined bool
}

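// Vocabulary is the flattened vocabulary emitted with the converted model:
// parallel slices of token strings, scores, and token types indexed by ID.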
type Vocabulary struct {
	Model  string
	Tokens []string
	Scores []float32
	Types  []int32
}

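// parseVocabularyFromTokenizer reads tokenizer.json and flattens its base
// vocabulary plus added tokens into a Vocabulary ordered by token ID.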
func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
	f, err := fsys.Open("tokenizer.json")
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var t tokenizer
	if err := json.NewDecoder(f).Decode(&t); err != nil {
		return nil, err
	}

	tokens := make(map[int]token, len(t.Model.Vocab))
	for k, v := range t.Model.Vocab {
		tokens[v] = token{
			ID:      v,
			Content: k,
		}
	}

	// added tokens override base vocabulary entries with the same ID
	for _, token := range t.AddedTokens {
		token.UserDefined = true
		tokens[token.ID] = token
	}

	keys := maps.Keys(tokens)
	slices.Sort(keys)

	v := Vocabulary{Model: "gpt2"}
	for _, k := range keys {
		token := tokens[k]
		v.Tokens = append(v.Tokens, token.Content)
		v.Scores = append(v.Scores, float32(token.ID))

		switch {
		case token.Special:
			v.Types = append(v.Types, tokenTypeControl)
		case token.UserDefined:
			v.Types = append(v.Types, tokenTypeUserDefined)
		default:
			v.Types = append(v.Types, tokenTypeNormal)
		}
	}

	return &v, nil
}

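// parseVocabulary selects a vocabulary parser based on which tokenizer file
// exists in fsys, preferring tokenizer.model (SentencePiece) over
// tokenizer.json.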
func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
	patterns := []struct {
		Pattern string
		Func    func(fs.FS) (*Vocabulary, error)
	}{
		{"tokenizer.model", parseSentencePiece},
		{"tokenizer.json", parseVocabularyFromTokenizer},
	}

	for _, pattern := range patterns {
		if _, err := fs.Stat(fsys, pattern.Pattern); errors.Is(err, os.ErrNotExist) {
			continue
		} else if err != nil {
			return nil, err
		}

		return pattern.Func(fsys)
	}

	return nil, errors.New("unknown tokenizer format")
}

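// SpecialVocabulary describes one special token (bos, eos, unk, sep, pad, cls,
// or mask): its type, ID, content, and whether the token should be added
// automatically when encoding (AddToken).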
type SpecialVocabulary struct {
	Type     string
	ID       int
	Content  string
	AddToken bool
}

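// Key maps a special token type to the longer name used when the token is
// written out, keeping the upstream "seperator" misspelling for compatibility.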
func (sv SpecialVocabulary) Key() string {
	switch t := sv.Type; t {
	case "bos", "eos", "cls", "mask":
		return t
	case "unk":
		return "unknown"
	case "sep":
		//nolint:misspell // this is an upstream typo
		return "seperator"
	case "pad":
		return "padding"
	}

	panic("unknown special vocabulary type")
}