parser.go 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. package parser
  2. import (
  3. "bufio"
  4. "bytes"
  5. "encoding/binary"
  6. "errors"
  7. "fmt"
  8. "io"
  9. "log/slog"
  10. "strconv"
  11. "strings"
  12. "unicode/utf16"
  13. "unicode/utf8"
  14. )
  15. type File struct {
  16. Commands []Command
  17. }
  18. func (f File) String() string {
  19. var sb strings.Builder
  20. for _, cmd := range f.Commands {
  21. fmt.Fprintln(&sb, cmd.String())
  22. }
  23. return sb.String()
  24. }
  25. type Command struct {
  26. Name string
  27. Args string
  28. }
  29. func (c Command) String() string {
  30. var sb strings.Builder
  31. switch c.Name {
  32. case "model":
  33. fmt.Fprintf(&sb, "FROM %s", c.Args)
  34. case "license", "template", "system", "adapter":
  35. fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
  36. case "message":
  37. role, message, _ := strings.Cut(c.Args, ": ")
  38. fmt.Fprintf(&sb, "MESSAGE %s %s", role, quote(message))
  39. default:
  40. fmt.Fprintf(&sb, "PARAMETER %s %s", c.Name, quote(c.Args))
  41. }
  42. return sb.String()
  43. }
  44. type state int
  45. const (
  46. stateNil state = iota
  47. stateName
  48. stateValue
  49. stateParameter
  50. stateMessage
  51. stateComment
  52. )
  53. var (
  54. errMissingFrom = errors.New("no FROM line")
  55. errInvalidMessageRole = errors.New("message role must be one of \"system\", \"user\", or \"assistant\"")
  56. errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
  57. )
  58. func ParseFile(r io.Reader) (*File, error) {
  59. var cmd Command
  60. var curr state
  61. var b bytes.Buffer
  62. var role string
  63. var f File
  64. br := bufio.NewReader(r)
  65. var sc scannerDecoder = utf8ScannerDecoder{}
  66. if bom, err := br.Peek(2); err != nil {
  67. slog.Warn("error reading byte-order mark", "error", err)
  68. } else if bytes.Equal(bom, []byte{0xFE, 0xFF}) {
  69. sc = utf16ScannerDecoder{binary.LittleEndian}
  70. //nolint:errcheck
  71. br.Discard(2)
  72. } else if bytes.Equal(bom, []byte{0xFF, 0xFE}) {
  73. sc = utf16ScannerDecoder{binary.BigEndian}
  74. //nolint:errcheck
  75. br.Discard(2)
  76. }
  77. scanner := bufio.NewScanner(br)
  78. scanner.Split(sc.ScanBytes)
  79. for scanner.Scan() {
  80. r, err := sc.DecodeRune(scanner.Bytes())
  81. if err != nil {
  82. return nil, err
  83. }
  84. next, r, err := parseRuneForState(r, curr)
  85. if errors.Is(err, io.ErrUnexpectedEOF) {
  86. return nil, fmt.Errorf("%w: %s", err, b.String())
  87. } else if err != nil {
  88. return nil, err
  89. }
  90. // process the state transition, some transitions need to be intercepted and redirected
  91. if next != curr {
  92. switch curr {
  93. case stateName:
  94. if !isValidCommand(b.String()) {
  95. return nil, errInvalidCommand
  96. }
  97. // next state sometimes depends on the current buffer value
  98. switch s := strings.ToLower(b.String()); s {
  99. case "from":
  100. cmd.Name = "model"
  101. case "parameter":
  102. // transition to stateParameter which sets command name
  103. next = stateParameter
  104. case "message":
  105. // transition to stateMessage which validates the message role
  106. next = stateMessage
  107. fallthrough
  108. default:
  109. cmd.Name = s
  110. }
  111. case stateParameter:
  112. cmd.Name = b.String()
  113. case stateMessage:
  114. if !isValidMessageRole(b.String()) {
  115. return nil, errInvalidMessageRole
  116. }
  117. role = b.String()
  118. case stateComment, stateNil:
  119. // pass
  120. case stateValue:
  121. s, ok := unquote(b.String())
  122. if !ok || isSpace(r) {
  123. if _, err := b.WriteRune(r); err != nil {
  124. return nil, err
  125. }
  126. continue
  127. }
  128. if role != "" {
  129. s = role + ": " + s
  130. role = ""
  131. }
  132. cmd.Args = s
  133. f.Commands = append(f.Commands, cmd)
  134. }
  135. b.Reset()
  136. curr = next
  137. }
  138. if strconv.IsPrint(r) {
  139. if _, err := b.WriteRune(r); err != nil {
  140. return nil, err
  141. }
  142. }
  143. }
  144. // flush the buffer
  145. switch curr {
  146. case stateComment, stateNil:
  147. // pass; nothing to flush
  148. case stateValue:
  149. s, ok := unquote(b.String())
  150. if !ok {
  151. return nil, io.ErrUnexpectedEOF
  152. }
  153. if role != "" {
  154. s = role + ": " + s
  155. }
  156. cmd.Args = s
  157. f.Commands = append(f.Commands, cmd)
  158. default:
  159. return nil, io.ErrUnexpectedEOF
  160. }
  161. for _, cmd := range f.Commands {
  162. if cmd.Name == "model" {
  163. return &f, nil
  164. }
  165. }
  166. return nil, errMissingFrom
  167. }
  168. func parseRuneForState(r rune, cs state) (state, rune, error) {
  169. switch cs {
  170. case stateNil:
  171. switch {
  172. case r == '#':
  173. return stateComment, 0, nil
  174. case isSpace(r), isNewline(r):
  175. return stateNil, 0, nil
  176. default:
  177. return stateName, r, nil
  178. }
  179. case stateName:
  180. switch {
  181. case isAlpha(r):
  182. return stateName, r, nil
  183. case isSpace(r):
  184. return stateValue, 0, nil
  185. default:
  186. return stateNil, 0, errInvalidCommand
  187. }
  188. case stateValue:
  189. switch {
  190. case isNewline(r):
  191. return stateNil, r, nil
  192. case isSpace(r):
  193. return stateNil, r, nil
  194. default:
  195. return stateValue, r, nil
  196. }
  197. case stateParameter:
  198. switch {
  199. case isAlpha(r), isNumber(r), r == '_':
  200. return stateParameter, r, nil
  201. case isSpace(r):
  202. return stateValue, 0, nil
  203. default:
  204. return stateNil, 0, io.ErrUnexpectedEOF
  205. }
  206. case stateMessage:
  207. switch {
  208. case isAlpha(r):
  209. return stateMessage, r, nil
  210. case isSpace(r):
  211. return stateValue, 0, nil
  212. default:
  213. return stateNil, 0, io.ErrUnexpectedEOF
  214. }
  215. case stateComment:
  216. switch {
  217. case isNewline(r):
  218. return stateNil, 0, nil
  219. default:
  220. return stateComment, 0, nil
  221. }
  222. default:
  223. return stateNil, 0, errors.New("")
  224. }
  225. }
  226. func quote(s string) string {
  227. if strings.Contains(s, "\n") || strings.HasPrefix(s, " ") || strings.HasSuffix(s, " ") {
  228. if strings.Contains(s, "\"") {
  229. return `"""` + s + `"""`
  230. }
  231. return `"` + s + `"`
  232. }
  233. return s
  234. }
  235. func unquote(s string) (string, bool) {
  236. // TODO: single quotes
  237. if len(s) >= 3 && s[:3] == `"""` {
  238. if len(s) >= 6 && s[len(s)-3:] == `"""` {
  239. return s[3 : len(s)-3], true
  240. }
  241. return "", false
  242. }
  243. if len(s) >= 1 && s[0] == '"' {
  244. if len(s) >= 2 && s[len(s)-1] == '"' {
  245. return s[1 : len(s)-1], true
  246. }
  247. return "", false
  248. }
  249. return s, true
  250. }
  251. func isAlpha(r rune) bool {
  252. return r >= 'a' && r <= 'z' || r >= 'A' && r <= 'Z'
  253. }
  254. func isNumber(r rune) bool {
  255. return r >= '0' && r <= '9'
  256. }
  257. func isSpace(r rune) bool {
  258. return r == ' ' || r == '\t'
  259. }
  260. func isNewline(r rune) bool {
  261. return r == '\r' || r == '\n'
  262. }
  263. func isValidMessageRole(role string) bool {
  264. return role == "system" || role == "user" || role == "assistant"
  265. }
  266. func isValidCommand(cmd string) bool {
  267. switch strings.ToLower(cmd) {
  268. case "from", "license", "template", "system", "adapter", "parameter", "message":
  269. return true
  270. default:
  271. return false
  272. }
  273. }
  274. type scannerDecoder interface {
  275. ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error)
  276. DecodeRune([]byte) (rune, error)
  277. }
  278. type utf8ScannerDecoder struct{}
  279. func (utf8ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
  280. return scanBytesN(data, 1, atEOF)
  281. }
  282. func (utf8ScannerDecoder) DecodeRune(data []byte) (rune, error) {
  283. r, _ := utf8.DecodeRune(data)
  284. return r, nil
  285. }
  286. type utf16ScannerDecoder struct {
  287. binary.ByteOrder
  288. }
  289. func (utf16ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
  290. return scanBytesN(data, 2, atEOF)
  291. }
  292. func (e utf16ScannerDecoder) DecodeRune(data []byte) (rune, error) {
  293. return utf16.Decode([]uint16{e.ByteOrder.Uint16(data)})[0], nil
  294. }
  295. func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) {
  296. if atEOF && len(data) == 0 {
  297. return 0, nil, nil
  298. }
  299. return n, data[:n], nil
  300. }