process_text.go 741 B

12345678910111213141516171819202122232425
  1. package mllama
  2. import (
  3. "github.com/ollama/ollama/ml"
  4. "github.com/ollama/ollama/model"
  5. )
  6. type TextProcessor struct {
  7. model.BytePairEncoding
  8. }
  9. func newTextProcessor(c ml.Config) TextProcessor {
  10. return TextProcessor{
  11. BytePairEncoding: model.BytePairEncoding{
  12. Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
  13. Vocabulary: &model.Vocabulary{
  14. Values: c.Strings("tokenizer.ggml.tokens"),
  15. Types: c.Uints("tokenizer.ggml.token_type"),
  16. Merges: c.Strings("tokenizer.ggml.merges"),
  17. BOS: c.Uint("tokenizer.ggml.bos_token_id"),
  18. EOS: c.Uint("tokenizer.ggml.eos_token_id"),
  19. },
  20. },
  21. }
  22. }