// process_text_spm.go
  1. package model
  2. import (
  3. "fmt"
  4. "iter"
  5. "log/slog"
  6. "strings"
  7. //"unicode/utf8"
  8. "github.com/dlclark/regexp2"
  9. queue "github.com/emirpasic/gods/queues/priorityqueue"
  10. )
  11. const spmWhitespaceSep = "▁"
  12. func replaceWhitespaceBySeperator(s string) string {
  13. return strings.ReplaceAll(s, " ", spmWhitespaceSep)
  14. }
// SentencePieceModel implements SentencePiece-style tokenization
// (Encode/Decode) on top of a loaded vocabulary.
type SentencePieceModel struct {
	maxTokenLen int             // byte length of the longest mergeable token seen at construction
	pre         *regexp2.Regexp // pre-tokenizer pattern used to split input before merging
	vocab       *Vocabulary     // token strings, scores, and types
}
  20. func NewSentencePieceModel(pre string, vocab *Vocabulary) SentencePieceModel {
  21. fmt.Printf("Tokens (%d): %5s %5s %5s ...\n", len(vocab.Values), vocab.Values[0], vocab.Values[1], vocab.Values[2])
  22. fmt.Printf("Scores (%d): %0.3f %0.3f %0.3f ...\n", len(vocab.Scores), vocab.Scores[0], vocab.Scores[1], vocab.Scores[2])
  23. fmt.Printf("Types (%d): %5d %5d %5d ...\n", len(vocab.Types), vocab.Types[0], vocab.Types[1], vocab.Types[2])
  24. counter := map[int]int{}
  25. var maxTokenLen int
  26. for cnt, _ := range vocab.Types {
  27. switch vocab.Types[cnt] {
  28. case TOKEN_TYPE_NORMAL, TOKEN_TYPE_USER_DEFINED, TOKEN_TYPE_UNUSED:
  29. maxTokenLen = max(maxTokenLen, len(vocab.Values[cnt]))
  30. fallthrough
  31. default:
  32. counter[int(vocab.Types[cnt])] += 1
  33. }
  34. }
  35. fmt.Printf("Normal: %d\n", counter[TOKEN_TYPE_NORMAL])
  36. fmt.Printf("Unknown: %d\n", counter[TOKEN_TYPE_UNKNOWN])
  37. fmt.Printf("Control: %d\n", counter[TOKEN_TYPE_CONTROL])
  38. fmt.Printf("User Defined: %d\n", counter[TOKEN_TYPE_USER_DEFINED])
  39. fmt.Printf("Unused: %d\n", counter[TOKEN_TYPE_UNUSED])
  40. fmt.Printf("Byte: %d\n", counter[TOKEN_TYPE_BYTE])
  41. fmt.Printf("Max token len: %d\n", maxTokenLen)
  42. return SentencePieceModel{
  43. maxTokenLen: maxTokenLen,
  44. pre: regexp2.MustCompile(pre, regexp2.Unicode|regexp2.RE2),
  45. vocab: vocab,
  46. }
  47. }
// Is reports whether id is the given special token class (e.g. BOS/EOS),
// delegating to the underlying vocabulary.
func (spm SentencePieceModel) Is(id int32, special Special) bool {
	return spm.vocab.Is(id, special)
}
  51. func (spm *SentencePieceModel) split(s string) iter.Seq[string] {
  52. return func(yield func(string) bool) {
  53. for m, _ := spm.pre.FindStringMatch(s); m != nil; m, _ = spm.pre.FindNextMatch(m) {
  54. if !yield(m.String()) {
  55. break
  56. }
  57. }
  58. }
  59. }
  60. func (spm SentencePieceModel) Encode(s string) ([]int32, error) {
  61. fragments := []fragment{{value: s}}
  62. for _, special := range spm.vocab.SpecialVocabulary() {
  63. // TODO: process special tokens concurrently
  64. id := spm.vocab.Encode(special)
  65. for i := 0; i < len(fragments); i++ {
  66. frag := fragments[i]
  67. if len(frag.ids) > 0 {
  68. continue
  69. }
  70. var middle []fragment
  71. switch i := strings.Index(frag.value, special); {
  72. case i < 0:
  73. middle = append(middle, frag)
  74. case i > 0:
  75. middle = append(middle, fragment{value: frag.value[:i]})
  76. fallthrough
  77. default:
  78. middle = append(middle, fragment{value: special, ids: []int32{id}})
  79. if rest := frag.value[i+len(special):]; rest != "" {
  80. middle = append(middle, fragment{value: rest})
  81. }
  82. }
  83. fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
  84. }
  85. }
  86. fmt.Printf("frags = %#v\n", fragments)
  87. var ids []int32
  88. for _, frag := range fragments {
  89. if len(frag.ids) > 0 {
  90. ids = append(ids, frag.ids...)
  91. continue
  92. }
  93. for split := range spm.split(frag.value) {
  94. split = replaceWhitespaceBySeperator(split)
  95. var sb strings.Builder
  96. sb.Write([]byte(split))
  97. if id := spm.vocab.Encode(sb.String()); id >= 0 {
  98. ids = append(ids, id)
  99. continue
  100. }
  101. runes := []rune(sb.String())
  102. pq := queue.NewWith(func(a, b any) int {
  103. priA := a.(*candidate)
  104. priB := b.(*candidate)
  105. if priA.score > priB.score || (priA.score == priB.score && priA.a < priB.a) {
  106. return 1
  107. }
  108. return -1
  109. })
  110. merges := make([]merge, len(runes))
  111. for r := range runes {
  112. merges[r] = merge{
  113. p: r - 1,
  114. n: r + 1,
  115. runes: []rune{runes[r]},
  116. }
  117. }
  118. fmt.Printf("remaining runes = %#v\n", runes)
  119. fmt.Printf("merges = %#v\n", merges)
  120. pairwise := func(a, b int) *candidate {
  121. if a < 0 || b >= len(runes) {
  122. return nil
  123. }
  124. left, right := string(merges[a].runes), string(merges[b].runes)
  125. fmt.Printf("looking up '%s'\n", left+right)
  126. if id := spm.vocab.Encode(left + right); id >= 0 {
  127. return &candidate{
  128. a: a,
  129. b: b,
  130. length: len(left + " " + right),
  131. score: spm.vocab.Scores[id],
  132. }
  133. }
  134. return nil
  135. }
  136. for i := range len(runes) - 1 {
  137. if pair := pairwise(i, i+1); pair != nil {
  138. pq.Enqueue(pair)
  139. }
  140. }
  141. pqv := pq.Values()
  142. for _, v := range pqv {
  143. e := v.(*candidate)
  144. fmt.Printf("candidate = %#v\n", e)
  145. }
  146. for !pq.Empty() {
  147. v, _ := pq.Dequeue()
  148. pair := v.(*candidate)
  149. left, right := merges[pair.a], merges[pair.b]
  150. if len(left.runes) == 0 || len(right.runes) == 0 {
  151. continue
  152. }
  153. merges[pair.a].runes = append(left.runes, right.runes...)
  154. merges[pair.b].runes = nil
  155. merges[pair.a].n = right.n
  156. if right.n < len(merges) {
  157. merges[right.n].p = pair.a
  158. }
  159. if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
  160. pq.Enqueue(pair)
  161. }
  162. if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
  163. pq.Enqueue(pair)
  164. }
  165. }
  166. fmt.Printf("merges = %#v\n", merges)
  167. for _, merge := range merges {
  168. if len(merge.runes) > 0 {
  169. if id := spm.vocab.Encode(string(merge.runes)); id >= 0 {
  170. ids = append(ids, id)
  171. } else {
  172. fmt.Printf("!!! missing token for '%s'\n", string(merge.runes))
  173. }
  174. }
  175. }
  176. }
  177. }
  178. fmt.Printf("tokens = %#v\n", ids)
  179. return ids, nil
  180. }
// candidate is a proposed merge of the adjacent symbols at indices a and b
// in the merge list, prioritized by the merged token's vocabulary score.
type candidate struct {
	a, b   int     // indices of the left/right symbols to merge
	score  float32 // score of the merged token in the vocabulary
	length int     // byte length of left+" "+right at creation; NOTE(review): not read anywhere in this file — confirm before removing
}
  186. func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
  187. var sb strings.Builder
  188. for _, id := range ids {
  189. data := spm.vocab.Decode(id)
  190. data = strings.ReplaceAll(data, spmWhitespaceSep, " ")
  191. if _, err := sb.WriteString(data); err != nil {
  192. return "", err
  193. }
  194. }
  195. slog.Debug("decoded", "ids", ids, "text", sb.String())
  196. return sb.String(), nil
  197. }