|
@@ -1,11 +1,11 @@
|
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
|
-index 73f52435..2b81b4bd 100644
|
|
|
|
|
|
+index 2b9ace28..172640e2 100644
|
|
--- a/src/llama.cpp
|
|
--- a/src/llama.cpp
|
|
+++ b/src/llama.cpp
|
|
+++ b/src/llama.cpp
|
|
-@@ -5092,16 +5092,7 @@ static void llm_load_vocab(
|
|
|
|
-
|
|
|
|
- // for now, only BPE models have pre-tokenizers
|
|
|
|
|
|
+@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
|
|
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
|
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
|
|
|
+ vocab.tokenizer_add_space_prefix = false;
|
|
|
|
+ vocab.tokenizer_clean_spaces = true;
|
|
- if (tokenizer_pre.empty()) {
|
|
- if (tokenizer_pre.empty()) {
|
|
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
|
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
|
|
- LLAMA_LOG_WARN("%s: \n", __func__);
|
|
- LLAMA_LOG_WARN("%s: \n", __func__);
|
|
@@ -20,7 +20,7 @@ index 73f52435..2b81b4bd 100644
|
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
} else if (
|
|
} else if (
|
|
tokenizer_pre == "llama3" ||
|
|
tokenizer_pre == "llama3" ||
|
|
-@@ -5164,7 +5155,8 @@ static void llm_load_vocab(
|
|
|
|
|
|
+@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
|
|
tokenizer_pre == "jais") {
|
|
tokenizer_pre == "jais") {
|
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
|
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
|
|
} else {
|
|
} else {
|