
Update llama.cpp submodule to `5921b8f0` (#4731)

* update llama.cpp submodule to `5921b8f089d3b7bda86aac5a66825df6a6c10603`

* add patch
Jeffrey Morgan 11 months ago
Commit 22f5c12ced
2 changed files with 22 additions and 25 deletions
  1. llm/llama.cpp (+1 -1)
  2. llm/patches/05-default-pretokenizer.diff (+21 -24)

+ 1 - 1
llm/llama.cpp

@@ -1 +1 @@
-Subproject commit 74f33adf5f8b20b08fc5a6aa17ce081abe86ef2f
+Subproject commit 5921b8f089d3b7bda86aac5a66825df6a6c10603

+ 21 - 24
llm/patches/05-default-pretokenizer.diff

@@ -1,35 +1,32 @@
-From d02a06f3f45a09255ace8684a66590e06ce44605 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Thu, 23 May 2024 11:33:20 -0700
-Subject: [PATCH] default pretokenizer on unrecognized type
-
----
- llama.cpp | 5 +----
- 1 file changed, 1 insertion(+), 4 deletions(-)
-
 diff --git a/llama.cpp b/llama.cpp
-index 15c66077..af1aede3 100644
+index 40d2ec2c..74f3ee9c 100644
 --- a/llama.cpp
 +++ b/llama.cpp
-@@ -4504,9 +4504,6 @@ static void llm_load_vocab(
-                 LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
-                 LLAMA_LOG_WARN("%s:                                             \n", __func__);
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--            } else if (
--                    tokenizer_pre == "default") {
+@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
+ 
+         // for now, only BPE models have pre-tokenizers
+         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+-            if (tokenizer_pre.empty()) {
+-                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+-                LLAMA_LOG_WARN("%s:                                             \n", __func__);
+-                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+-                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
+-                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
+-                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
+-                LLAMA_LOG_WARN("%s:                                             \n", __func__);
 -                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+-            } else if (
++            if (
+                     tokenizer_pre == "default") {
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
              } else if (
-                     tokenizer_pre == "llama3"   ||
-                     tokenizer_pre == "llama-v3" ||
-@@ -4553,7 +4550,7 @@ static void llm_load_vocab(
-                 tokenizer_pre == "dbrx") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
+                 tokenizer_pre == "smaug-bpe") {
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
              } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
++                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
 +                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
              }
          } else {
              vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--- 
-2.45.1
-
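In short, the refreshed patch replaces the throw std::runtime_error("unknown pre-tokenizer type: ...") path with a warning plus a fall-back to LLAMA_VOCAB_PRE_TYPE_DEFAULT, and drops the now-redundant special case for an empty tokenizer_pre. Below is a minimal standalone C++ sketch of that control flow; the enum values and the resolve_pre_type helper are simplified stand-ins for illustration, not llama.cpp's actual definitions.

#include <cstdio>
#include <string>

// Simplified stand-ins for llama.cpp's pre-tokenizer type enum (illustration only).
enum vocab_pre_type {
    VOCAB_PRE_TYPE_DEFAULT,
    VOCAB_PRE_TYPE_LLAMA3,
    VOCAB_PRE_TYPE_SMAUG,
};

// Resolve the pre-tokenizer name read from model metadata. Missing or
// unrecognized names no longer abort the load; they warn and fall back to the
// default, mirroring the behaviour the patch applies to llm_load_vocab.
static vocab_pre_type resolve_pre_type(const std::string & tokenizer_pre) {
    if (tokenizer_pre == "default") {
        return VOCAB_PRE_TYPE_DEFAULT;
    } else if (tokenizer_pre == "llama3" || tokenizer_pre == "llama-v3") {
        return VOCAB_PRE_TYPE_LLAMA3;
    } else if (tokenizer_pre == "smaug-bpe") {
        return VOCAB_PRE_TYPE_SMAUG;
    } else {
        // Pre-patch llama.cpp threw: unknown pre-tokenizer type: '<name>'.
        // Patched behaviour: warn and continue with the default pre-tokenizer.
        std::fprintf(stderr, "%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
        return VOCAB_PRE_TYPE_DEFAULT;
    }
}

int main() {
    // A model naming a pre-tokenizer this build does not know about now loads
    // with the default pre-tokenizer instead of raising an error.
    std::printf("resolved type: %d\n", resolve_pre_type("some-future-pretokenizer"));
    return 0;
}

With this fallback, a GGUF whose metadata names a pre-tokenizer added after this llama.cpp snapshot still loads, at the cost of possibly less accurate pre-tokenization, rather than failing outright.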