Procházet zdrojové kódy

fix utf16 for multibyte runes

Michael Yang před 10 měsíci
rodič
revize
d528e1af75
Změněn 1 soubor: 6 přidání a 29 odebrání
  1. 6 29
      parser/parser.go

+ 6 - 29
parser/parser.go

@@ -8,7 +8,9 @@ import (
 	"io"
 	"strconv"
 	"strings"
-	"unicode"
+
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/transform"
 )
 
 type File struct {
@@ -69,14 +71,11 @@ func ParseFile(r io.Reader) (*File, error) {
 	var b bytes.Buffer
 	var role string
 
-	var lineCount int
-	var linePos int
-
-	var utf16 bool
-
 	var f File
 
-	br := bufio.NewReader(r)
+	tr := unicode.BOMOverride(unicode.UTF8.NewDecoder())
+	br := bufio.NewReader(transform.NewReader(r, tr))
+
 	for {
 		r, _, err := br.ReadRune()
 		if errors.Is(err, io.EOF) {
@@ -85,17 +84,6 @@ func ParseFile(r io.Reader) (*File, error) {
 			return nil, err
 		}
 
-		// the utf16 byte order mark will be read as "unreadable" by ReadRune()
-		if isUnreadable(r) && lineCount == 0 && linePos == 0 {
-			utf16 = true
-			continue
-		}
-
-		// skip the second byte if we're reading utf16
-		if utf16 && r == 0 {
-			continue
-		}
-
 		next, r, err := parseRuneForState(r, curr)
 		if errors.Is(err, io.ErrUnexpectedEOF) {
 			return nil, fmt.Errorf("%w: %s", err, b.String())
@@ -103,13 +91,6 @@ func ParseFile(r io.Reader) (*File, error) {
 			return nil, err
 		}
 
-		if isNewline(r) {
-			lineCount++
-			linePos = 0
-		} else {
-			linePos++
-		}
-
 		// process the state transition, some transitions need to be intercepted and redirected
 		if next != curr {
 			switch curr {
@@ -309,10 +290,6 @@ func isNewline(r rune) bool {
 	return r == '\r' || r == '\n'
 }
 
-func isUnreadable(r rune) bool {
-	return r == unicode.ReplacementChar
-}
-
 func isValidMessageRole(role string) bool {
 	return role == "system" || role == "user" || role == "assistant"
 }