0009-fix-deepseek-deseret-regex.patch 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
  2. From: Daniel Hiltgen <daniel@ollama.com>
  3. Date: Fri, 25 Oct 2024 16:25:18 -0700
  4. Subject: [PATCH] fix deepseek deseret regex
  5. On Windows, when compiled with gcc, the C++ regex library failed to handle
  6. the Deseret characters (U+10400-U+1044F) in the DeepSeek pre-tokenizer regex.
  7. ---
  8. src/llama-vocab.cpp | 2 +-
  9. src/unicode.cpp | 22 ++++++++++++++++++++++
  10. 2 files changed, 23 insertions(+), 1 deletion(-)
  11. diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
  12. index a4eee9b8..1ca827eb 100644
  13. --- a/src/llama-vocab.cpp
  14. +++ b/src/llama-vocab.cpp
  15. @@ -295,7 +295,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  16. case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
  17. regex_exprs = {
  18. "[\r\n]",
  19. - "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
  20. + "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
  21. "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
  22. "\\s+$",
  23. "[一-龥ࠀ-一가-퟿]+",
  24. diff --git a/src/unicode.cpp b/src/unicode.cpp
  25. index e63bb4ab..9dd53b9a 100644
  26. --- a/src/unicode.cpp
  27. +++ b/src/unicode.cpp
  28. @@ -2,6 +2,11 @@
  29. #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
  30. #endif
  31. +#if defined(_WIN32)
  32. +#define WIN32_LEAN_AND_MEAN
  33. +#include <windows.h>
  34. +#endif
  35. +
  36. #include "unicode.h"
  37. #include "unicode-data.h"
  38. @@ -200,6 +205,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
  39. }
  40. static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
  41. +#ifdef _WIN32
  42. + int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
  43. + if (!wlen) {
  44. + throw std::invalid_argument("failed to convert regex");
  45. + }
  46. + wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
  47. + wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
  48. + if (!wlen) {
  49. + free(wbuf);
  50. + throw std::invalid_argument("failed to convert regex");
  51. + }
  52. + std::wstring ret = std::wstring(wbuf);
  53. + free(wbuf);
  54. + return ret;
  55. +#else
  56. +
  57. #if defined(__clang__)
  58. // disable C++17 deprecation warning for std::codecvt_utf8
  59. # pragma clang diagnostic push
  60. @@ -213,6 +234,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
  61. #endif
  62. return conv.from_bytes(s);
  63. +#endif
  64. }
  65. static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {