stop.go 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. package common
  2. import (
  3. "strings"
  4. )
  5. func FindStop(sequence string, stops []string) (bool, string) {
  6. for _, stop := range stops {
  7. if strings.Contains(sequence, stop) {
  8. return true, stop
  9. }
  10. }
  11. return false, ""
  12. }
  13. func ContainsStopSuffix(sequence string, stops []string) bool {
  14. for _, stop := range stops {
  15. for i := 1; i <= len(stop); i++ {
  16. if strings.HasSuffix(sequence, stop[:i]) {
  17. return true
  18. }
  19. }
  20. }
  21. return false
  22. }
  23. // truncateStop removes the provided stop string from pieces,
  24. // returning the partial pieces with stop removed, including truncating
  25. // the last piece if required (and signalling if this was the case)
  26. func TruncateStop(pieces []string, stop string) ([]string, bool) {
  27. joined := strings.Join(pieces, "")
  28. index := strings.Index(joined, stop)
  29. if index == -1 {
  30. return pieces, false
  31. }
  32. joined = joined[:index]
  33. // Split truncated string back into pieces of original lengths
  34. lengths := make([]int, len(pieces))
  35. for i, piece := range pieces {
  36. lengths[i] = len(piece)
  37. }
  38. var result []string
  39. tokenTruncated := false
  40. start := 0
  41. for _, length := range lengths {
  42. if start >= len(joined) {
  43. break
  44. }
  45. end := start + length
  46. if end > len(joined) {
  47. end = len(joined)
  48. tokenTruncated = true
  49. }
  50. result = append(result, joined[start:end])
  51. start = end
  52. }
  53. return result, tokenTruncated
  54. }
  55. func IncompleteUnicode(token string) bool {
  56. incomplete := false
  57. // check if there is incomplete UTF-8 character at the end
  58. for i := 1; i < 5 && i <= len(token); i++ {
  59. c := token[len(token)-i]
  60. if (c & 0xc0) == 0x80 {
  61. // continuation byte: 10xxxxxx
  62. continue
  63. }
  64. if (c & 0xe0) == 0xc0 {
  65. // 2-byte character: 110xxxxx ...
  66. incomplete = i < 2
  67. } else if (c & 0xf0) == 0xe0 {
  68. // 3-byte character: 1110xxxx ...
  69. incomplete = i < 3
  70. } else if (c & 0xf8) == 0xf0 {
  71. // 4-byte character: 11110xxx ...
  72. incomplete = i < 4
  73. }
  74. // else 1-byte character or invalid byte
  75. break
  76. }
  77. return incomplete
  78. }