parser.go 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. package parser
  2. import (
  3. "bufio"
  4. "bytes"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "strconv"
  9. "strings"
  10. "unicode"
  11. )
  12. type File struct {
  13. Commands []Command
  14. }
  15. func (f File) String() string {
  16. var sb strings.Builder
  17. for _, cmd := range f.Commands {
  18. fmt.Fprintln(&sb, cmd.String())
  19. }
  20. return sb.String()
  21. }
  22. type Command struct {
  23. Name string
  24. Args string
  25. }
  26. func (c Command) String() string {
  27. var sb strings.Builder
  28. switch c.Name {
  29. case "model":
  30. fmt.Fprintf(&sb, "FROM %s", c.Args)
  31. case "license", "template", "system", "adapter":
  32. fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
  33. case "message":
  34. role, message, _ := strings.Cut(c.Args, ": ")
  35. fmt.Fprintf(&sb, "MESSAGE %s %s", role, quote(message))
  36. default:
  37. fmt.Fprintf(&sb, "PARAMETER %s %s", c.Name, quote(c.Args))
  38. }
  39. return sb.String()
  40. }
  41. type state int
  42. const (
  43. stateNil state = iota
  44. stateName
  45. stateValue
  46. stateParameter
  47. stateMessage
  48. stateComment
  49. )
  50. var (
  51. errMissingFrom = errors.New("no FROM line")
  52. errInvalidMessageRole = errors.New("message role must be one of \"system\", \"user\", or \"assistant\"")
  53. errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
  54. )
  55. func ParseFile(r io.Reader) (*File, error) {
  56. var cmd Command
  57. var curr state
  58. var b bytes.Buffer
  59. var role string
  60. var lineCount int
  61. var linePos int
  62. var utf16 bool
  63. var f File
  64. br := bufio.NewReader(r)
  65. for {
  66. r, _, err := br.ReadRune()
  67. if errors.Is(err, io.EOF) {
  68. break
  69. } else if err != nil {
  70. return nil, err
  71. }
  72. // the utf16 byte order mark will be read as "unreadable" by ReadRune()
  73. if isUnreadable(r) && lineCount == 0 && linePos == 0 {
  74. utf16 = true
  75. continue
  76. }
  77. // skip the second byte if we're reading utf16
  78. if utf16 && r == 0 {
  79. continue
  80. }
  81. next, r, err := parseRuneForState(r, curr)
  82. if errors.Is(err, io.ErrUnexpectedEOF) {
  83. return nil, fmt.Errorf("%w: %s", err, b.String())
  84. } else if err != nil {
  85. return nil, err
  86. }
  87. if isNewline(r) {
  88. lineCount++
  89. linePos = 0
  90. } else {
  91. linePos++
  92. }
  93. // process the state transition, some transitions need to be intercepted and redirected
  94. if next != curr {
  95. switch curr {
  96. case stateName:
  97. if !isValidCommand(b.String()) {
  98. return nil, errInvalidCommand
  99. }
  100. // next state sometimes depends on the current buffer value
  101. switch s := strings.ToLower(b.String()); s {
  102. case "from":
  103. cmd.Name = "model"
  104. case "parameter":
  105. // transition to stateParameter which sets command name
  106. next = stateParameter
  107. case "message":
  108. // transition to stateMessage which validates the message role
  109. next = stateMessage
  110. fallthrough
  111. default:
  112. cmd.Name = s
  113. }
  114. case stateParameter:
  115. cmd.Name = b.String()
  116. case stateMessage:
  117. if !isValidMessageRole(b.String()) {
  118. return nil, errInvalidMessageRole
  119. }
  120. role = b.String()
  121. case stateComment, stateNil:
  122. // pass
  123. case stateValue:
  124. s, ok := unquote(b.String())
  125. if !ok || isSpace(r) {
  126. if _, err := b.WriteRune(r); err != nil {
  127. return nil, err
  128. }
  129. continue
  130. }
  131. if role != "" {
  132. s = role + ": " + s
  133. role = ""
  134. }
  135. cmd.Args = s
  136. f.Commands = append(f.Commands, cmd)
  137. }
  138. b.Reset()
  139. curr = next
  140. }
  141. if strconv.IsPrint(r) {
  142. if _, err := b.WriteRune(r); err != nil {
  143. return nil, err
  144. }
  145. }
  146. }
  147. // flush the buffer
  148. switch curr {
  149. case stateComment, stateNil:
  150. // pass; nothing to flush
  151. case stateValue:
  152. s, ok := unquote(b.String())
  153. if !ok {
  154. return nil, io.ErrUnexpectedEOF
  155. }
  156. if role != "" {
  157. s = role + ": " + s
  158. }
  159. cmd.Args = s
  160. f.Commands = append(f.Commands, cmd)
  161. default:
  162. return nil, io.ErrUnexpectedEOF
  163. }
  164. for _, cmd := range f.Commands {
  165. if cmd.Name == "model" {
  166. return &f, nil
  167. }
  168. }
  169. return nil, errMissingFrom
  170. }
  171. func parseRuneForState(r rune, cs state) (state, rune, error) {
  172. switch cs {
  173. case stateNil:
  174. switch {
  175. case r == '#':
  176. return stateComment, 0, nil
  177. case isSpace(r), isNewline(r):
  178. return stateNil, 0, nil
  179. default:
  180. return stateName, r, nil
  181. }
  182. case stateName:
  183. switch {
  184. case isAlpha(r):
  185. return stateName, r, nil
  186. case isSpace(r):
  187. return stateValue, 0, nil
  188. default:
  189. return stateNil, 0, errInvalidCommand
  190. }
  191. case stateValue:
  192. switch {
  193. case isNewline(r):
  194. return stateNil, r, nil
  195. case isSpace(r):
  196. return stateNil, r, nil
  197. default:
  198. return stateValue, r, nil
  199. }
  200. case stateParameter:
  201. switch {
  202. case isAlpha(r), isNumber(r), r == '_':
  203. return stateParameter, r, nil
  204. case isSpace(r):
  205. return stateValue, 0, nil
  206. default:
  207. return stateNil, 0, io.ErrUnexpectedEOF
  208. }
  209. case stateMessage:
  210. switch {
  211. case isAlpha(r):
  212. return stateMessage, r, nil
  213. case isSpace(r):
  214. return stateValue, 0, nil
  215. default:
  216. return stateNil, 0, io.ErrUnexpectedEOF
  217. }
  218. case stateComment:
  219. switch {
  220. case isNewline(r):
  221. return stateNil, 0, nil
  222. default:
  223. return stateComment, 0, nil
  224. }
  225. default:
  226. return stateNil, 0, errors.New("")
  227. }
  228. }
  229. func quote(s string) string {
  230. if strings.Contains(s, "\n") || strings.HasPrefix(s, " ") || strings.HasSuffix(s, " ") {
  231. if strings.Contains(s, "\"") {
  232. return `"""` + s + `"""`
  233. }
  234. return `"` + s + `"`
  235. }
  236. return s
  237. }
  238. func unquote(s string) (string, bool) {
  239. // TODO: single quotes
  240. if len(s) >= 3 && s[:3] == `"""` {
  241. if len(s) >= 6 && s[len(s)-3:] == `"""` {
  242. return s[3 : len(s)-3], true
  243. }
  244. return "", false
  245. }
  246. if len(s) >= 1 && s[0] == '"' {
  247. if len(s) >= 2 && s[len(s)-1] == '"' {
  248. return s[1 : len(s)-1], true
  249. }
  250. return "", false
  251. }
  252. return s, true
  253. }
  254. func isAlpha(r rune) bool {
  255. return r >= 'a' && r <= 'z' || r >= 'A' && r <= 'Z'
  256. }
  257. func isNumber(r rune) bool {
  258. return r >= '0' && r <= '9'
  259. }
  260. func isSpace(r rune) bool {
  261. return r == ' ' || r == '\t'
  262. }
  263. func isNewline(r rune) bool {
  264. return r == '\r' || r == '\n'
  265. }
  266. func isUnreadable(r rune) bool {
  267. return r == unicode.ReplacementChar
  268. }
  269. func isValidMessageRole(role string) bool {
  270. return role == "system" || role == "user" || role == "assistant"
  271. }
  272. func isValidCommand(cmd string) bool {
  273. switch strings.ToLower(cmd) {
  274. case "from", "license", "template", "system", "adapter", "parameter", "message":
  275. return true
  276. default:
  277. return false
  278. }
  279. }