tokenizer.go

package convert

import (
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"errors"
	"fmt"
	"io/fs"
	"log/slog"
	"os"
	"slices"
	"strings"

	"golang.org/x/exp/maps"
)
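
// Token type identifiers recorded in Vocabulary.Types. The blank identifier
// reserves the zero value, so the named types start at 1.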
const (
	_ int32 = iota
	tokenTypeNormal
	tokenTypeUnknown
	tokenTypeControl
	tokenTypeUserDefined
	tokenTypeUnused
	tokenTypeByte
)
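
// Tokenizer bundles everything parsed from a model's tokenizer files: the
// embedded vocabulary, special tokens, BPE merges, the pretokenizer name, and
// the chat template.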
type Tokenizer struct {
	*Vocabulary
	SpecialVocabulary []*SpecialVocabulary
	Merges            []string

	Pre      string
	Template string
}
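
// parseTokenizer loads the vocabulary, then layers in BPE merges and a named
// pretokenizer from tokenizer.json, and the chat template plus the requested
// special tokens from tokenizer_config.json. Both JSON files are optional;
// if one is missing the corresponding defaults are left in place. A
// hypothetical call site:
//
//	t, err := parseTokenizer(os.DirFS("/path/to/model"), []string{"bos", "eos", "unk", "pad"})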
func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error) {
	v, err := parseVocabulary(fsys)
	if err != nil {
		return nil, err
	}

	t := &Tokenizer{
		Vocabulary: v,
		Pre:        "default",
	}

	addedTokens := make(map[string]token)
	if f, err := fsys.Open("tokenizer.json"); errors.Is(err, os.ErrNotExist) {
	} else if err != nil {
		return nil, err
	} else {
		defer f.Close()

		var tt tokenizer
		if err := json.NewDecoder(f).Decode(&tt); err != nil {
			return nil, err
		}

		for _, t := range tt.AddedTokens {
			addedTokens[t.Content] = t
		}

		if len(tt.Model.Merges) == 0 {
			// noop; merges is empty
		} else if err := json.Unmarshal(tt.Model.Merges, &t.Merges); err == nil {
			// noop; merges is []string
		} else if merges, err := func() ([][]string, error) {
			var merges [][]string
			if err := json.Unmarshal(tt.Model.Merges, &merges); err != nil {
				return nil, err
			}

			return merges, nil
		}(); err == nil {
			t.Merges = make([]string, len(merges))
			for i := range merges {
				t.Merges[i] = strings.Join(merges[i], " ")
			}
		} else {
			return nil, fmt.Errorf("could not parse tokenizer merges. expected []string or [][]string: %w", err)
		}

		sha256sum := sha256.New()
		for _, pt := range tt.PreTokenizer.PreTokenizers {
			switch pt.Type {
			case "Split":
				if pt.Pattern.Regex != "" {
					// create a checksum of all Split pretokenizers which should be sufficient
					// to identify the pretokenizer
					sha256sum.Write([]byte(pt.Pattern.Regex))
				}
			}
		}

		switch digest := hex.EncodeToString(sha256sum.Sum(nil)); digest {
		case "d98f9631be1e9607a9848c26c1f9eac1aa9fc21ac6ba82a2fc0741af9780a48f":
			t.Pre = "llama-bpe"
		case "03df5c5863ad70781dcfdef491ead25140f895fe8010964be0daefe27be32b02":
			t.Pre = "deepseek-llm"
		case "21cde974d587f0d54dc8d56b183cc1e6239600172035c68fbd6d4b9f8da0576e":
			t.Pre = "deepseek-coder"
		case "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855":
			// noop, empty pretokenizer
		default:
			slog.Warn("unknown pretokenizer, using default", "digest", digest)
		}
	}

	if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
	} else if err != nil {
		return nil, err
	} else {
		defer f.Close()

		var p map[string]json.RawMessage
		if err := json.NewDecoder(f).Decode(&p); err != nil {
			return nil, err
		}

		if template, ok := p["chat_template"]; ok {
			var s []struct {
				Name     string `json:"name"`
				Template string `json:"template"`
			}
			if err := json.Unmarshal(template, &t.Template); err == nil {
				// noop
			} else if err := json.Unmarshal(template, &s); err == nil {
				for _, e := range s {
					if e.Name == "default" {
						t.Template = e.Template
						break
					}
				}
			} else {
				return nil, fmt.Errorf("invalid chat_template: %w", err)
			}
		}

		for _, st := range specialTokenTypes {
			sv := SpecialVocabulary{Type: st}
			if bts, ok := p[fmt.Sprintf("add_%s_token", st)]; ok {
				if err := json.Unmarshal(bts, &sv.AddToken); err != nil {
					return nil, err
				}
			}

			if bts, ok := p[fmt.Sprintf("%s_token", st)]; ok {
				var content string
				if err := json.Unmarshal(bts, &content); err != nil {
					var mm map[string]any
					if err := json.Unmarshal(bts, &mm); err != nil {
						continue
					}

					content, ok = mm["content"].(string)
					if !ok {
						continue
					}
				}

				sv.Content = content
			}

			if id, ok := addedTokens[sv.Content]; ok {
				sv.ID = id.ID
				t.SpecialVocabulary = append(t.SpecialVocabulary, &sv)
			}
		}
	}

	return t, nil
}
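
// tokenizer mirrors the subset of tokenizer.json that the converter reads:
// the added tokens, the model's vocab and merges, and the pretokenizer's
// Split patterns.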
type tokenizer struct {
	AddedTokens []token `json:"added_tokens"`

	Model struct {
		Type   string          `json:"type"`
		Vocab  map[string]int  `json:"vocab"`
		Merges json.RawMessage `json:"merges"`
	} `json:"model"`

	PreTokenizer struct {
		PreTokenizers []struct {
			Type    string `json:"type"`
			Pattern struct {
				Regex string `json:"Regex"`
			} `json:"pattern"`
		} `json:"pretokenizers"`
	} `json:"pre_tokenizer"`
}
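
// token is a single vocabulary entry from tokenizer.json. UserDefined has no
// JSON tag; it is set for entries that come from added_tokens.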
type token struct {
	ID          int    `json:"id"`
	Content     string `json:"content"`
	Special     bool   `json:"special"`
	UserDefined bool
}
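
// Vocabulary is the flattened token table: parallel slices of token text,
// scores, and types, plus the name of the tokenizer model.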
type Vocabulary struct {
	Model  string
	Tokens []string
	Scores []float32
	Types  []int32
}
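
// parseVocabularyFromTokenizer builds a "gpt2"-style Vocabulary from
// tokenizer.json: tokens are ordered by ID, each score is simply the token's
// ID, and the type is derived from the special and added-token flags.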
func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
	f, err := fsys.Open("tokenizer.json")
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var t tokenizer
	if err := json.NewDecoder(f).Decode(&t); err != nil {
		return nil, err
	}

	tokens := make(map[int]token, len(t.Model.Vocab))
	for k, v := range t.Model.Vocab {
		tokens[v] = token{
			ID:      v,
			Content: k,
		}
	}

	for _, token := range t.AddedTokens {
		token.UserDefined = true
		tokens[token.ID] = token
	}

	keys := maps.Keys(tokens)
	slices.Sort(keys)

	v := Vocabulary{Model: "gpt2"}
	for _, k := range keys {
		token := tokens[k]
		v.Tokens = append(v.Tokens, token.Content)
		v.Scores = append(v.Scores, float32(token.ID))

		switch {
		case token.Special:
			v.Types = append(v.Types, tokenTypeControl)
		case token.UserDefined:
			v.Types = append(v.Types, tokenTypeUserDefined)
		default:
			v.Types = append(v.Types, tokenTypeNormal)
		}
	}

	return &v, nil
}
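
// parseVocabulary chooses a vocabulary parser based on which tokenizer file
// exists, preferring a SentencePiece tokenizer.model over tokenizer.json.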
func parseVocabulary(fsys fs.FS) (*Vocabulary, error) {
	patterns := []struct {
		Pattern string
		Func    func(fs.FS) (*Vocabulary, error)
	}{
		{"tokenizer.model", parseSentencePiece},
		{"tokenizer.json", parseVocabularyFromTokenizer},
	}

	for _, pattern := range patterns {
		if _, err := fs.Stat(fsys, pattern.Pattern); errors.Is(err, os.ErrNotExist) {
			continue
		} else if err != nil {
			return nil, err
		}

		return pattern.Func(fsys)
	}

	return nil, errors.New("unknown tokenizer format")
}
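
// SpecialVocabulary describes a single special token (bos, eos, ...): its
// type, resolved ID and content, and whether the tokenizer is configured to
// add it automatically.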
type SpecialVocabulary struct {
	Type     string
	ID       int
	Content  string
	AddToken bool
}
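
// Key maps a special token type to its canonical key name, expanding
// abbreviations such as "unk" to "unknown" and "pad" to "padding".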
func (sv SpecialVocabulary) Key() string {
	switch t := sv.Type; t {
	case "bos", "eos", "cls", "mask":
		return t
	case "unk":
		return "unknown"
	case "sep":
		//nolint:misspell // this is an upstream typo
		return "seperator"
	case "pad":
		return "padding"
	}

	panic("unknown special vocabulary type")
}