6 月之前 · 099f7077a1
--- a/llama/llama-vocab.cpp
+++ b/llama/llama-vocab.cpp
@@ -415,7 +415,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
 
				             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
			
 
				                 regex_exprs = {
			
 
				                     "[\r\n]",
			
 
				-                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿǄ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
			
 
				+                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿǄ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
			
 
				                     "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
			
 
				                     "\\s+$",
			
 
				                     "[一-龥ࠀ-一가-퟿]+",
			
--- a/llama/patches/0012-fix-deepseek-deseret-regex.patch
+++ b/llama/patches/0012-fix-deepseek-deseret-regex.patch
@@ -0,0 +1,66 @@
 
				+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
			
 
				+From: Daniel Hiltgen <daniel@ollama.com>
			
 
				+Date: Fri, 25 Oct 2024 16:25:18 -0700
			
 
				+Subject: [PATCH] fix deepseek deseret regex
			
 
				+
			
 
				+On windows compiled with gcc the c++ regex library failed to handle
			
 
				+the characters
			
 
				+---
			
 
				+ src/llama-vocab.cpp |  2 +-
			
 
				+ src/unicode.cpp     | 21 +++++++++++++++++++++
			
 
				+ 2 files changed, 22 insertions(+), 1 deletion(-)
			
 
				+
			
 
				+diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
			
 
				+index d2f34ddd..3ef6af19 100644
			
 
				+--- a/src/llama-vocab.cpp
			
 
				++++ b/src/llama-vocab.cpp
			
 
				+@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
			
 
				+             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
			
 
				+                 regex_exprs = {
			
 
				+                     "[\r\n]",
			
 
				+-                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿǄ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
			
 
				++                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿǄ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿﬀ-ﬆﬓ-ﬗＡ-Ｚａ-ｚ\U00010400-\U0001044f𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
			
 
				+                     "\\s?[!-/:-~！-／：-～‘-‟　-。]+",
			
 
				+                     "\\s+$",
			
 
				+                     "[一-龥ࠀ-一가-퟿]+",
			
 
				+diff --git a/src/unicode.cpp b/src/unicode.cpp
			
 
				+index f4e941cd..9d78ff16 100644
			
 
				+--- a/src/unicode.cpp
			
 
				++++ b/src/unicode.cpp
			
 
				+@@ -2,6 +2,11 @@
			
 
				+ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
			
 
				+ #endif
			
 
				+ 
			
 
				++#if defined(_WIN32)
			
 
				++#define WIN32_LEAN_AND_MEAN
			
 
				++#include <windows.h>
			
 
				++#endif
			
 
				++
			
 
				+ #include "unicode.h"
			
 
				+ #include "unicode-data.h"
			
 
				+ 
			
 
				+@@ -201,8 +206,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
			
 
				+ }
			
 
				+ 
			
 
				+ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
			
 
				++#ifdef _WIN32
			
 
				++    int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
			
 
				++    if (!wlen) {
			
 
				++        throw std::invalid_argument("failed to convert regex");
			
 
				++    }
			
 
				++    wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
			
 
				++    wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
			
 
				++    if (!wlen) {
			
 
				++        free(wbuf);
			
 
				++        throw std::invalid_argument("failed to convert regex");
			
 
				++    }
			
 
				++    std::wstring ret = std::wstring(wbuf);
			
 
				++    free(wbuf);
			
 
				++    return ret;
			
 
				++#else
			
 
				+     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
			
 
				+     return conv.from_bytes(s);
			
 
				++#endif
			
 
				+ }
			
 
				+ 
			
 
				+ static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
			
--- a/llama/unicode.cpp
+++ b/llama/unicode.cpp
@@ -28,6 +28,11 @@
 
				 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
			
 
				 #endif
			
 
				 
			
 
				+#if defined(_WIN32)
			
 
				+#define WIN32_LEAN_AND_MEAN
			
 
				+#include <windows.h>
			
 
				+#endif
			
 
				+
			
 
				 #include "unicode.h"
			
 
				 #include "unicode-data.h"
			
 
				 
			
@@ -227,8 +232,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
 
				 }
			
 
				 
			
 
				 static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
			
 
				+#ifdef _WIN32
			
 
				+    int wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, NULL, 0);
			
 
				+    if (!wlen) {
			
 
				+        throw std::invalid_argument("failed to convert regex");
			
 
				+    }
			
 
				+    wchar_t * wbuf = (wchar_t *) malloc(wlen * sizeof(wchar_t));
			
 
				+    wlen = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), -1, wbuf, wlen);
			
 
				+    if (!wlen) {
			
 
				+        free(wbuf);
			
 
				+        throw std::invalid_argument("failed to convert regex");
			
 
				+    }
			
 
				+    std::wstring ret = std::wstring(wbuf);
			
 
				+    free(wbuf);
			
 
				+    return ret;
			
 
				+#else
			
 
				     std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
			
 
				     return conv.from_bytes(s);
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				 static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {