|
@@ -0,0 +1,66 @@
|
|
|
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
|
+From: Daniel Hiltgen <daniel@ollama.com>
|
|
|
+Date: Fri, 25 Oct 2024 16:25:18 -0700
|
|
|
+Subject: [PATCH] fix deepseek deseret regex
|
|
|
+
|
|
|
+On windows compiled with gcc the c++ regex library failed to handle
|
|
|
+the characters
|
|
|
+---
|
|
|
+ src/llama-vocab.cpp | 2 +-
|
|
|
+ src/unicode.cpp | 21 +++++++++++++++++++++
|
|
|
+ 2 files changed, 22 insertions(+), 1 deletion(-)
|
|
|
+
|
|
|
+diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
|
|
|
+index d2f34ddd..3ef6af19 100644
|
|
|
+--- a/src/llama-vocab.cpp
|
|
|
++++ b/src/llama-vocab.cpp
|
|
|
+@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|
|
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
|
|
|
+ regex_exprs = {
|
|
|
+ "[\r\n]",
|
|
|
+- "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
|
|
|
++ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
|
|
|
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
|
|
|
+ "\\s+$",
|
|
|
+ "[一-龥ࠀ-一가-]+",
|
|
|
+diff --git a/src/unicode.cpp b/src/unicode.cpp
|
|
|
+index f4e941cd..9d78ff16 100644
|
|
|
+--- a/src/unicode.cpp
|
|
|
++++ b/src/unicode.cpp
|
|
|
+@@ -2,6 +2,11 @@
|
|
|
+ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
|
|
+ #endif
|
|
|
+
|
|
|
++#if defined(_WIN32)
|
|
|
++#define WIN32_LEAN_AND_MEAN
|
|
|
++#include <windows.h>
|
|
|
++#endif
|
|
|
++
|
|
|
+ #include "unicode.h"
|
|
|
+ #include "unicode-data.h"
|
|
|
+
|
|
|
+@@ -201,8 +206,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
|
|
|
+ }
|
|
|
+
|
|
|
+ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
|
|
|
++#ifdef _WIN32
|
|
|
++ int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
|
|
|
++ if (!wlen) {
|
|
|
++ throw std::invalid_argument("failed to convert regex");
|
|
|
++ }
|
|
|
++ wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
|
|
|
++ wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
|
|
|
++ if (!wlen) {
|
|
|
++ free(wbuf);
|
|
|
++ throw std::invalid_argument("failed to convert regex");
|
|
|
++ }
|
|
|
++ std::wstring ret = std::wstring(wbuf);
|
|
|
++ free(wbuf);
|
|
|
++ return ret;
|
|
|
++#else
|
|
|
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
|
|
+ return conv.from_bytes(s);
|
|
|
++#endif
|
|
|
+ }
|
|
|
+
|
|
|
+ static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
|