123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596 |
- package common
- import (
- "strings"
- )
// FindStop reports whether sequence contains any of the given stop strings.
//
// The stops are checked in slice order. On the first one that occurs anywhere
// in sequence it returns true together with that stop string; if none occurs
// (or stops is empty) it returns false and the empty string.
func FindStop(sequence string, stops []string) (bool, string) {
	for i := 0; i < len(stops); i++ {
		candidate := stops[i]
		if !strings.Contains(sequence, candidate) {
			continue
		}
		return true, candidate
	}
	return false, ""
}
// ContainsStopSuffix reports whether sequence ends with a non-empty leading
// portion of any stop string (the complete stop string included).
//
// Prefixes are taken byte-wise, so every length from 1 up to len(stop) is
// tried. An empty stop string never matches because it has no non-empty
// prefix.
func ContainsStopSuffix(sequence string, stops []string) bool {
	for s := 0; s < len(stops); s++ {
		candidate := stops[s]
		// Try every non-empty byte prefix of the candidate, shortest first.
		for n := 0; n < len(candidate); n++ {
			prefix := candidate[:n+1]
			if strings.HasSuffix(sequence, prefix) {
				return true
			}
		}
	}
	return false
}
// TruncateStop cuts the concatenation of pieces at the first occurrence of
// stop and splits what is left back along the original piece boundaries.
//
// If stop does not occur in the joined pieces, pieces is returned unchanged
// together with false. Otherwise the returned slice holds the pieces that
// precede stop; a piece that only partly precedes stop is shortened, and the
// boolean reports whether such a shortening took place. Pieces that begin at
// or after the position of stop are dropped.
func TruncateStop(pieces []string, stop string) ([]string, bool) {
	full := strings.Join(pieces, "")
	cut := strings.Index(full, stop)
	if cut < 0 {
		return pieces, false
	}

	// Text still to be handed out, consumed piece by piece from the front.
	remaining := full[:cut]

	var kept []string
	truncated := false
	for _, piece := range pieces {
		if len(remaining) == 0 {
			break
		}
		n := len(piece)
		if n > len(remaining) {
			// This piece straddles the cut point: keep only its head.
			n = len(remaining)
			truncated = true
		}
		kept = append(kept, remaining[:n])
		remaining = remaining[n:]
	}
	return kept, truncated
}
// IncompleteUnicode reports whether token ends in the middle of a multi-byte
// UTF-8 character.
//
// It scans backwards over at most the last four bytes, skipping continuation
// bytes (10xxxxxx), until it meets another kind of byte. If that byte is a
// 2-, 3- or 4-byte lead byte, the result is true when fewer bytes follow it
// than the lead byte announces. A single-byte character, an invalid byte, or
// a window containing only continuation bytes yields false.
func IncompleteUnicode(token string) bool {
	for back := 1; back <= 4 && back <= len(token); back++ {
		b := token[len(token)-back]
		switch {
		case b&0xc0 == 0x80:
			// Continuation byte (10xxxxxx): keep looking for the lead byte.
			continue
		case b&0xe0 == 0xc0:
			// Lead of a 2-byte character (110xxxxx).
			return back < 2
		case b&0xf0 == 0xe0:
			// Lead of a 3-byte character (1110xxxx).
			return back < 3
		case b&0xf8 == 0xf0:
			// Lead of a 4-byte character (11110xxx).
			return back < 4
		default:
			// Single-byte character or invalid byte.
			return false
		}
	}
	return false
}
|