pushdown_automata.go 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. package sample
  2. import (
  3. "slices"
  4. "github.com/ollama/ollama/model"
  5. )
  6. // TODO: / should be valid but an escape character
  7. var stringInvalidRunes = []rune{'\\', '\n', '\t', '{', '}', ':', ',', '/'}
  8. var intInvalidRunes = []rune{'e', 'E', ' ', '\n', '\t', '{', '}', ':', ',', '"'}
  9. var validIntRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-'}
  10. var validNumberRunes = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', '-', '+', 'e', 'E'}
  11. var validBoolRunes = []rune{'t', 'r', 'u', 'e', 'f', 'a', 'l', 's', 'e'}
  12. var validNullRunes = []rune{'n', 'u', 'l', 'l'}
  13. type PDANode struct {
  14. State JSONState
  15. TransitionEdges map[rune]*PDANode
  16. MaskTokenIDToNode map[int32]JSONState
  17. }
  18. func NewPDANode(state JSONState) *PDANode {
  19. return &PDANode{
  20. State: state,
  21. TransitionEdges: make(map[rune]*PDANode),
  22. MaskTokenIDToNode: make(map[int32]JSONState),
  23. }
  24. }
  25. func BuildGraph(proc model.TextProcessor) (*PDANode, map[JSONState]*PDANode, error) {
  26. stateToNodeMap := make(map[JSONState]*PDANode)
  27. // TODO: make this a loop
  28. startNode := NewPDANode(StateStart)
  29. stateToNodeMap[StateStart] = startNode
  30. objNode := NewPDANode(StateInObject)
  31. stateToNodeMap[StateInObject] = objNode
  32. objEndNode := NewPDANode(StateInObjectEnd)
  33. stateToNodeMap[StateInObjectEnd] = objEndNode
  34. objKeyNode := NewPDANode(StateInObjectKey)
  35. stateToNodeMap[StateInObjectKey] = objKeyNode
  36. objKeyEndNode := NewPDANode(StateInObjectKeyEnd)
  37. stateToNodeMap[StateInObjectKeyEnd] = objKeyEndNode
  38. colonNode := NewPDANode(StateInColon)
  39. stateToNodeMap[StateInColon] = colonNode
  40. commaNode := NewPDANode(StateInComma)
  41. stateToNodeMap[StateInComma] = commaNode
  42. newlineNode := NewPDANode(StateInNewline)
  43. stateToNodeMap[StateInNewline] = newlineNode
  44. spaceNode := NewPDANode(StateInSpace)
  45. stateToNodeMap[StateInSpace] = spaceNode
  46. spaceObjNode := NewPDANode(StateInObjSpace)
  47. stateToNodeMap[StateInObjSpace] = spaceObjNode
  48. tabNode := NewPDANode(StateInTab)
  49. stateToNodeMap[StateInTab] = tabNode
  50. stringNode := NewPDANode(StateInString)
  51. stateToNodeMap[StateInString] = stringNode
  52. stringEndNode := NewPDANode(StateInStringEnd)
  53. stateToNodeMap[StateInStringEnd] = stringEndNode
  54. listNode := NewPDANode(StateInList)
  55. stateToNodeMap[StateInList] = listNode
  56. listCommaNode := NewPDANode(StateInListComma)
  57. stateToNodeMap[StateInListComma] = listCommaNode
  58. listEndNode := NewPDANode(StateListEnd)
  59. stateToNodeMap[StateListEnd] = listEndNode
  60. numberNode := NewPDANode(StateInNumber)
  61. stateToNodeMap[StateInNumber] = numberNode
  62. boolNode := NewPDANode(StateInBool)
  63. stateToNodeMap[StateInBool] = boolNode
  64. nullNode := NewPDANode(StateInNull)
  65. stateToNodeMap[StateInNull] = nullNode
  66. // Defined with structured outputs only
  67. intNode := NewPDANode(StateInInt)
  68. stateToNodeMap[StateInInt] = intNode
  69. listObjEndNode := NewPDANode(StateInListObjectEnd)
  70. stateToNodeMap[StateInListObjectEnd] = listObjEndNode
  71. // TODO:
  72. // consider adding a node to just point to values, could be good to compute that
  73. // mask rather than many different nodes
  74. // Connect nodes
  75. // TODO: if all are single tokens then this can just be connected instead of defining the token
  76. startNode.TransitionEdges['{'] = objNode
  77. objNode.TransitionEdges['"'] = objKeyNode
  78. objNode.TransitionEdges['\n'] = newlineNode
  79. objNode.TransitionEdges[' '] = spaceObjNode
  80. //new line
  81. newlineNode.TransitionEdges['"'] = objKeyNode
  82. newlineNode.TransitionEdges['\t'] = tabNode
  83. tabNode.TransitionEdges['"'] = objKeyNode
  84. objKeyNode.TransitionEdges[rune(-1)] = objKeyNode
  85. objKeyNode.TransitionEdges['"'] = objKeyEndNode
  86. objKeyEndNode.TransitionEdges[':'] = colonNode
  87. objEndNode.TransitionEdges[','] = commaNode
  88. objEndNode.TransitionEdges['}'] = objEndNode
  89. // where values should be
  90. // this could be combined but the probs might change, we're alr doing a skip ahead
  91. colonNode.TransitionEdges[' '] = spaceNode
  92. colonNode.TransitionEdges['['] = listNode
  93. colonNode.TransitionEdges['{'] = objNode
  94. addValueConnections(colonNode, stateToNodeMap)
  95. // Leads to a value
  96. spaceNode.TransitionEdges['['] = listNode
  97. spaceNode.TransitionEdges['{'] = objNode
  98. addValueConnections(spaceNode, stateToNodeMap)
  99. // Values
  100. // string node
  101. stringNode.TransitionEdges[rune(-1)] = stringNode
  102. stringNode.TransitionEdges['"'] = stringEndNode
  103. // String end node
  104. addEnds(stringEndNode, stateToNodeMap)
  105. // TODO: add counters for allowable number of decimals, e, E, etc
  106. // number node
  107. for _, r := range validNumberRunes {
  108. numberNode.TransitionEdges[r] = numberNode
  109. }
  110. addEnds(numberNode, stateToNodeMap)
  111. // bool node
  112. for _, r := range validBoolRunes {
  113. boolNode.TransitionEdges[r] = boolNode
  114. }
  115. addEnds(boolNode, stateToNodeMap)
  116. // list node
  117. listNode.TransitionEdges[','] = commaNode
  118. listNode.TransitionEdges['{'] = objNode
  119. listNode.TransitionEdges[' '] = listNode
  120. listNode.TransitionEdges['\n'] = listNode
  121. addValueConnections(listNode, stateToNodeMap)
  122. // null node
  123. for _, r := range validNullRunes {
  124. nullNode.TransitionEdges[r] = nullNode
  125. }
  126. addEnds(nullNode, stateToNodeMap)
  127. // list comma
  128. // should point to values
  129. listCommaNode.TransitionEdges[' '] = listCommaNode
  130. listCommaNode.TransitionEdges['{'] = objNode
  131. listCommaNode.TransitionEdges['\n'] = newlineNode
  132. addValueConnections(listCommaNode, stateToNodeMap)
  133. // list object end
  134. listObjEndNode.TransitionEdges[','] = listCommaNode
  135. listObjEndNode.TransitionEdges[']'] = listEndNode
  136. // bool node
  137. for _, r := range validBoolRunes {
  138. boolNode.TransitionEdges[r] = boolNode
  139. }
  140. addEnds(boolNode, stateToNodeMap)
  141. listEndNode.TransitionEdges['}'] = objEndNode
  142. listEndNode.TransitionEdges[','] = commaNode
  143. commaNode.TransitionEdges['{'] = objNode
  144. commaNode.TransitionEdges['\n'] = newlineNode
  145. commaNode.TransitionEdges['\t'] = tabNode
  146. commaNode.TransitionEdges['"'] = objKeyNode
  147. commaNode.TransitionEdges[' '] = spaceObjNode
  148. spaceObjNode.TransitionEdges['"'] = objKeyNode
  149. spaceObjNode.TransitionEdges['\n'] = newlineNode
  150. return startNode, stateToNodeMap, nil
  151. }
  152. func addEnds(node *PDANode, stateToNodeMap map[JSONState]*PDANode) {
  153. node.TransitionEdges[','] = stateToNodeMap[StateInComma]
  154. node.TransitionEdges['}'] = stateToNodeMap[StateInObjectEnd]
  155. node.TransitionEdges[']'] = stateToNodeMap[StateListEnd]
  156. }
  157. func addValueConnections(node *PDANode, stateToNodeMap map[JSONState]*PDANode) {
  158. node.TransitionEdges['"'] = stateToNodeMap[StateInString]
  159. for _, r := range validNumberRunes {
  160. node.TransitionEdges[r] = stateToNodeMap[StateInNumber]
  161. }
  162. node.TransitionEdges['t'] = stateToNodeMap[StateInBool]
  163. node.TransitionEdges['f'] = stateToNodeMap[StateInBool]
  164. node.TransitionEdges['n'] = stateToNodeMap[StateInNull]
  165. }
  166. func PreComputeValidStates(stateToNodeMap map[JSONState]*PDANode, proc model.TextProcessor) error {
  167. vocab := proc.GetVocabulary()
  168. decodedToks := make([]string, len(vocab.Values))
  169. for i := range vocab.Values {
  170. token, err := proc.Decode([]int32{int32(i)})
  171. if err != nil {
  172. return err
  173. }
  174. decodedToks[i] = token
  175. }
  176. var err error
  177. for _, node := range stateToNodeMap {
  178. for i := range vocab.Values {
  179. token := decodedToks[i]
  180. // Skip EOS/BOS tokens and empty tokens since they are not valid in JSON
  181. if proc.Is(uint32(i), model.SpecialEOS) || proc.Is(uint32(i), model.SpecialBOS) || token == "" || token == "\"\"" {
  182. continue
  183. }
  184. valid := true
  185. curNode := node
  186. consumedSpecialRunes := make(map[rune]bool)
  187. for _, r := range token {
  188. valid, curNode, err = isRuneValid(r, curNode, consumedSpecialRunes)
  189. if err != nil {
  190. return err
  191. }
  192. if !valid {
  193. break
  194. }
  195. }
  196. if valid {
  197. node.MaskTokenIDToNode[int32(i)] = curNode.State
  198. }
  199. }
  200. }
  201. return nil
  202. }
  203. // garbage interface plz fix
  204. func isRuneValid(r rune, curNode *PDANode, consumedSpecialRunes map[rune]bool) (bool, *PDANode, error) {
  205. if consumedSpecialRunes[r] {
  206. return false, nil, nil
  207. }
  208. specialRune := slices.Contains(stringInvalidRunes, r)
  209. if specialRune {
  210. if curNode.State == StateInString || curNode.State == StateInObjectKey {
  211. return false, nil, nil
  212. }
  213. }
  214. // Check for specific rune transition
  215. if nextNode, ok := curNode.TransitionEdges[r]; ok {
  216. if specialRune {
  217. if curNode.State == nextNode.State {
  218. return false, nil, nil
  219. }
  220. consumedSpecialRunes[r] = true
  221. }
  222. return true, nextNode, nil
  223. }
  224. // Check for sentinel value - if present, any rune is valid
  225. if nextNode, ok := curNode.TransitionEdges[rune(-1)]; ok {
  226. return true, nextNode, nil
  227. }
  228. return false, nil, nil
  229. }