9 月之前 · 1475eab95f
--- a/llm/patches/10-tekken.diff
+++ b/llm/patches/10-tekken.diff
@@ -0,0 +1,43 @@
 
				+diff --git a/include/llama.h b/include/llama.h
			
 
				+index bb4b05ba..a92174e0 100644
			
 
				+--- a/include/llama.h
			
 
				++++ b/include/llama.h
			
 
				+@@ -92,6 +92,7 @@ extern "C" {
			
 
				+         LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
			
 
				+         LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
			
 
				+         LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
			
 
				++        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
			
 
				+     };
			
 
				+ 
			
 
				+     // note: these values should be synchronized with ggml_rope
			
 
				+diff --git a/src/llama.cpp b/src/llama.cpp
			
 
				+index 18364976..435b6fe5 100644
			
 
				+--- a/src/llama.cpp
			
 
				++++ b/src/llama.cpp
			
 
				+@@ -5429,6 +5429,12 @@ static void llm_load_vocab(
			
 
				+             } else if (
			
 
				+                 tokenizer_pre == "jais") {
			
 
				+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
			
 
				++            } else if (
			
 
				++                tokenizer_pre == "tekken") {
			
 
				++                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
			
 
				++                vocab.tokenizer_clean_spaces = false;
			
 
				++                vocab.tokenizer_ignore_merges = true;
			
 
				++                vocab.tokenizer_add_bos = true;
			
 
				+             } else {
			
 
				+                 LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
			
 
				+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
			
 
				+@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe {
			
 
				+                     " ?[^(\\s|.,!?…。，、।۔،)]+",
			
 
				+                 };
			
 
				+                 break;
			
 
				++            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
			
 
				++                    // original regex from tokenizer.json
			
 
				++                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
			
 
				++                regex_exprs = {
			
 
				++                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
			
 
				++                };
			
 
				++                break;
			
 
				+             default:
			
 
				+                 // default regex for BPE tokenization pre-processing
			
 
				+                 regex_exprs = {