package model

import (
	"testing"
)

// BenchmarkVocabulary is a reusable test vocabulary for benchmarks.
//
// NOTE(review): indices 16-18 of Values are empty strings whose Types entry
// is 3 (special token), and BOS/EOS point at two of them. The original
// special-token text (presumably "<s>"/"</s>"-style markers) appears to have
// been stripped from this source — confirm against version control.
var BenchmarkVocabulary = &Vocabulary{
	Values: []string{
		"Hello", "World", "!", "How", "are", "you",
		"t", "o", "d", "a", "y",
		"to", "tod", "toda", "today",
		" ", "", "", "",
		"'s", "'t", "'re", "'ve", "'m", "'ll", "'d",
	},
	// 3 marks special tokens; everything else is a regular (type 1) token.
	Types: []uint32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1},
	Merges: []string{
		"to",
		"tod",
		"toda",
		"today",
	},
	BOS: 16, // NOTE(review): token text at this index is empty — likely a stripped "<s>"-style marker.
	EOS: 17, // NOTE(review): token text at this index is empty — likely a stripped "</s>"-style marker.
}

// BenchmarkBytePairEncoding measures Encode and Decode throughput of
// BytePairEncoding across a range of representative inputs (plain text,
// merges, contractions, and a longer mixed passage).
func BenchmarkBytePairEncoding(b *testing.B) {
	bpe := BytePairEncoding{
		Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
		Vocabulary:   BenchmarkVocabulary,
	}

	benchmarks := []struct {
		name  string
		input string
	}{
		{
			name:  "simple_hello_world",
			input: "Hello World!",
		},
		{
			// NOTE(review): identical to simple_hello_world — the special
			// tokens this case was meant to exercise look stripped from the
			// source; restore them (e.g. BOS/EOS markers) when known.
			name:  "with_special_tokens",
			input: "Hello World!",
		},
		{
			name:  "with_merges",
			input: "today is today and today",
		},
		{
			name:  "with_contractions",
			input: "I'm don't won't can't they're we've you'll he'd",
		},
		{
			// NOTE(review): the original literal was split across physical
			// lines (invalid for a double-quoted Go string); reconstructed
			// here with an explicit \n. "special ones like and and" also
			// suggests stripped special-token text — confirm.
			name: "long_text",
			input: "Hello World! How are you today? I'm doing great! \n" +
				"This is a longer text to test the performance of the encoding and decoding process with multiple sentences and various tokens including special ones like and and contractions like don't and won't.",
		},
	}

	for _, bm := range benchmarks {
		// Benchmark encoding throughput.
		b.Run("Encode_"+bm.name, func(b *testing.B) {
			// Encode once up front so SetBytes reflects the per-op output
			// size; previously SetBytes was re-invoked on every timed
			// iteration, which both skews timing and is redundant.
			tokens, err := bpe.Encode(bm.input)
			if err != nil {
				b.Fatal(err)
			}
			b.SetBytes(int64(len(tokens) * 4)) // each token is 4 bytes (int32)
			b.ReportAllocs()
			b.ResetTimer() // exclude the setup encode from the measurement
			for range b.N {
				if _, err := bpe.Encode(bm.input); err != nil {
					b.Fatal(err)
				}
			}
		})

		// Encode the input once to obtain tokens for the decode benchmark.
		tokens, err := bpe.Encode(bm.input)
		if err != nil {
			b.Fatal(err)
		}

		// Benchmark decoding throughput.
		b.Run("Decode_"+bm.name, func(b *testing.B) {
			// Decode once to size SetBytes, then time only the decode loop.
			decoded, err := bpe.Decode(tokens)
			if err != nil {
				b.Fatal(err)
			}
			b.SetBytes(int64(len(decoded)))
			b.ReportAllocs()
			b.ResetTimer()
			for range b.N {
				if _, err := bpe.Decode(tokens); err != nil {
					b.Fatal(err)
				}
			}
		})
	}
}

// BenchmarkBytePairEncodingSplit measures the pretokenizer split step in
// isolation over inputs exercising contractions, digits, punctuation runs,
// repeated spaces, and newlines.
func BenchmarkBytePairEncodingSplit(b *testing.B) {
	bpe := BytePairEncoding{
		Pretokenizer: `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
	}

	benchmarks := []struct {
		name  string
		input string
	}{
		{
			name:  "simple_text",
			input: "Hello World!",
		},
		{
			name:  "with_contractions",
			input: "I'm don't won't",
		},
		{
			name:  "with_numbers",
			input: "In 2024 there are 365 days",
		},
		{
			name:  "with_special_chars",
			input: "Hello!! ...world",
		},
		{
			name:  "with_spaces",
			input: "Hello    World",
		},
		{
			name:  "with_newlines",
			input: "Hello\nWorld\nHow\nAre\nYou",
		},
	}

	for _, bm := range benchmarks {
		b.Run("Split_"+bm.name, func(b *testing.B) {
			// Split once up front to size SetBytes; previously it was set
			// inside the timed loop on every iteration.
			splits, err := bpe.split(bm.input)
			if err != nil {
				b.Fatal(err)
			}
			b.SetBytes(int64(len(splits)))
			b.ReportAllocs()
			b.ResetTimer()
			for range b.N {
				if _, err := bpe.split(bm.input); err != nil {
					b.Fatal(err)
				}
			}
		})
	}
}