10-tekken.diff 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. diff --git a/include/llama.h b/include/llama.h
  2. index bb4b05ba..a92174e0 100644
  3. --- a/include/llama.h
  4. +++ b/include/llama.h
  5. @@ -92,6 +92,7 @@ extern "C" {
  6. LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
  7. LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
  8. LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
  9. + LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
  10. };
  11. // note: these values should be synchronized with ggml_rope
  12. diff --git a/src/llama.cpp b/src/llama.cpp
  13. index 18364976..435b6fe5 100644
  14. --- a/src/llama.cpp
  15. +++ b/src/llama.cpp
  16. @@ -5429,6 +5429,12 @@ static void llm_load_vocab(
  17. } else if (
  18. tokenizer_pre == "jais") {
  19. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
  20. + } else if (
  21. + tokenizer_pre == "tekken") {
  22. + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
  23. + vocab.tokenizer_clean_spaces = false;
  24. + vocab.tokenizer_ignore_merges = true;
  25. + vocab.tokenizer_add_bos = true;
  26. } else {
  27. LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
  28. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  29. @@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe {
  30. " ?[^(\\s|.,!?…。,、।۔،)]+",
  31. };
  32. break;
  33. + case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
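  34. + // original regex from tokenizer.json, kept for reference only; the active pattern below rewrites its first two alternatives, replacing the \p{Lu}/\p{Ll}-style case classes with a \p{L} lookahead plus [^a-z] / [^A-Z]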
  35. + // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
  36. + regex_exprs = {
  37. + "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  38. + };
  39. + break;
  40. default:
  41. // default regex for BPE tokenization pre-processing
  42. regex_exprs = {