|
@@ -8,7 +8,9 @@ import (
|
|
|
"io"
|
|
|
"strconv"
|
|
|
"strings"
|
|
|
- "unicode"
|
|
|
+
|
|
|
+ "golang.org/x/text/encoding/unicode"
|
|
|
+ "golang.org/x/text/transform"
|
|
|
)
|
|
|
|
|
|
type File struct {
|
|
@@ -69,14 +71,11 @@ func ParseFile(r io.Reader) (*File, error) {
|
|
|
var b bytes.Buffer
|
|
|
var role string
|
|
|
|
|
|
- var lineCount int
|
|
|
- var linePos int
|
|
|
-
|
|
|
- var utf16 bool
|
|
|
-
|
|
|
var f File
|
|
|
|
|
|
- br := bufio.NewReader(r)
|
|
|
+ tr := unicode.BOMOverride(unicode.UTF8.NewDecoder())
|
|
|
+ br := bufio.NewReader(transform.NewReader(r, tr))
|
|
|
+
|
|
|
for {
|
|
|
r, _, err := br.ReadRune()
|
|
|
if errors.Is(err, io.EOF) {
|
|
@@ -85,17 +84,6 @@ func ParseFile(r io.Reader) (*File, error) {
|
|
|
return nil, err
|
|
|
}
|
|
|
|
|
|
- // the utf16 byte order mark will be read as "unreadable" by ReadRune()
|
|
|
- if isUnreadable(r) && lineCount == 0 && linePos == 0 {
|
|
|
- utf16 = true
|
|
|
- continue
|
|
|
- }
|
|
|
-
|
|
|
- // skip the second byte if we're reading utf16
|
|
|
- if utf16 && r == 0 {
|
|
|
- continue
|
|
|
- }
|
|
|
-
|
|
|
next, r, err := parseRuneForState(r, curr)
|
|
|
if errors.Is(err, io.ErrUnexpectedEOF) {
|
|
|
return nil, fmt.Errorf("%w: %s", err, b.String())
|
|
@@ -103,13 +91,6 @@ func ParseFile(r io.Reader) (*File, error) {
|
|
|
return nil, err
|
|
|
}
|
|
|
|
|
|
- if isNewline(r) {
|
|
|
- lineCount++
|
|
|
- linePos = 0
|
|
|
- } else {
|
|
|
- linePos++
|
|
|
- }
|
|
|
-
|
|
|
// process the state transition, some transitions need to be intercepted and redirected
|
|
|
if next != curr {
|
|
|
switch curr {
|
|
@@ -309,10 +290,6 @@ func isNewline(r rune) bool {
|
|
|
return r == '\r' || r == '\n'
|
|
|
}
|
|
|
|
|
|
-func isUnreadable(r rune) bool {
|
|
|
- return r == unicode.ReplacementChar
|
|
|
-}
|
|
|
-
|
|
|
// isValidMessageRole reports whether role is exactly "system", "user", or
// "assistant". The comparison is a plain string equality check, so it is
// case-sensitive and does not trim surrounding whitespace.
func isValidMessageRole(role string) bool {
|
|
|
return role == "system" || role == "user" || role == "assistant"
|
|
|
}
|