1 mēnesi atpakaļ · e093db92c4
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -245,6 +245,20 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 
				 	return &m, nil
			
 
				 }
			
 
				 
			
 
				+func LoadVocabFromFile(path string) (*Vocab, error) {
			
 
				+	mp := C.CString(path)
			
 
				+	defer C.free(unsafe.Pointer(mp))
			
 
				+	v := Vocab{c: C.llama_load_vocab_from_file(mp)}
			
 
				+	if v.c == nil {
			
 
				+		return nil, fmt.Errorf("unable to load vocab: %s", path)
			
 
				+	}
			
 
				+	return &v, nil
			
 
				+}
			
 
				+
			
 
				+func FreeVocab(vocab *Vocab) {
			
 
				+	C.llama_free_vocab(vocab.c)
			
 
				+}
			
 
				+
			
 
				 func FreeModel(model *Model) {
			
 
				 	C.llama_model_free(model.c)
			
 
				 }
			
@@ -293,6 +307,10 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
 
				 	return nil
			
 
				 }
			
 
				 
			
 
				+type Vocab struct {
			
 
				+	c *C.struct_llama_vocab
			
 
				+}
			
 
				+
			
 
				 func (m *Model) Vocab() *C.struct_llama_vocab {
			
 
				 	return C.llama_model_get_vocab(m.c)
			
 
				 }
			
@@ -669,3 +687,53 @@ func SchemaToGrammar(schema []byte) []byte {
 
				 	}
			
 
				 	return buf[:n]
			
 
				 }
			
 
				+
			
 
				+type Sampler struct {
			
 
				+	c *C.struct_llama_sampler
			
 
				+}
			
 
				+
			
 
				+func NewGrammarSampler(vocab *Vocab, grammar string) *Sampler {
			
 
				+	cGrammar := C.CString(grammar)
			
 
				+	cRoot := C.CString("root")
			
 
				+	defer C.free(unsafe.Pointer(cGrammar))
			
 
				+	defer C.free(unsafe.Pointer(cRoot))
			
 
				+
			
 
				+	sampler := &Sampler{c: C.llama_sampler_init_grammar(vocab.c, cGrammar, cRoot)}
			
 
				+
			
 
				+	return sampler
			
 
				+}
			
 
				+
			
 
				+func (s *Sampler) Accept(token int32) {
			
 
				+	C.llama_sampler_accept(s.c, C.llama_token(token))
			
 
				+}
			
 
				+
			
 
				+type TokenData struct {
			
 
				+	Id    int32
			
 
				+	Logit float32
			
 
				+}
			
 
				+
			
 
				+func (s *Sampler) Apply(tokens []TokenData) {
			
 
				+	tds := make([]C.struct_llama_token_data, len(tokens))
			
 
				+	for i, token := range tokens {
			
 
				+		tds[i] = C.struct_llama_token_data{
			
 
				+			id:    C.int32_t(token.Id),
			
 
				+			logit: C.float(token.Logit),
			
 
				+			p:     C.float(0.0),
			
 
				+		}
			
 
				+	}
			
 
				+	tda := &C.llama_token_data_array{
			
 
				+		data:     (*C.struct_llama_token_data)(unsafe.Pointer(&tds[0])),
			
 
				+		size:     C.size_t(len(tokens)),
			
 
				+		selected: C.int64_t(-1),
			
 
				+		sorted:   C.bool(false),
			
 
				+	}
			
 
				+
			
 
				+	var pinner runtime.Pinner
			
 
				+	pinner.Pin(&tds[0])
			
 
				+	defer pinner.Unpin()
			
 
				+
			
 
				+	C.llama_sampler_apply(s.c, tda)
			
 
				+	for i := range tokens {
			
 
				+		tokens[i].Logit = float32(tds[i].logit)
			
 
				+	}
			
 
				+}
			
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@@ -2,6 +2,9 @@
 
				 #include "sampling.h"
			
 
				 #include "sampling_ext.h"
			
 
				 #include "json-schema-to-grammar.h"
			
 
				+#include "llama.h"
			
 
				+#include "llama-model.h"
			
 
				+#include "llama-model-loader.h"
			
 
				 
			
 
				 struct common_sampler *common_sampler_cinit(const struct llama_model *model, struct common_sampler_cparams *params) {
			
 
				     try {
			
@@ -64,3 +67,22 @@ int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len)
 
				         return 0;
			
 
				     }
			
 
				 }
			
 
				+
			
 
				+struct llama_vocab * llama_load_vocab_from_file(const char * fname) {
			
 
				+    llama_vocab * vocab = new llama_vocab();
			
 
				+    try {
			
 
				+        const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
			
 
				+        std::vector<std::string> splits = {};
			
 
				+        llama_model_loader ml(std::string(fname), splits, false, false, nullptr);
			
 
				+        vocab->load(ml, kv);
			
 
				+    } catch (const std::exception & err) {
			
 
				+        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
			
 
				+        return nullptr;
			
 
				+    }
			
 
				+
			
 
				+    return vocab;
			
 
				+}
			
 
				+
			
 
				+void llama_free_vocab(struct llama_vocab * vocab) {
			
 
				+    delete vocab;
			
 
				+}
			
--- a/llama/sampling_ext.h
+++ b/llama/sampling_ext.h
@@ -35,6 +35,9 @@ extern "C"
 
				 
			
 
				     int schema_to_grammar(const char *json_schema, char *grammar, size_t max_len);
			
 
				 
			
 
				+    struct llama_vocab * llama_load_vocab_from_file(const char * fname);
			
 
				+    void llama_free_vocab(struct llama_vocab * vocab);
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/llm/server.go
+++ b/llm/server.go
@@ -729,29 +729,24 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 
				 	}
			
 
				 
			
 
				 	if len(req.Format) > 0 {
			
 
				-		format := string(req.Format)
			
 
				-		if format != `null` && format != `""` {
			
 
				-			if s.textProcessor != nil {
			
 
				-				// New engine handles this on the backend
			
 
				-				request["format"] = req.Format
			
 
				-			} else {
			
 
				-				// old engine
			
 
				-				switch format {
			
 
				-				case `"json"`:
			
 
				-					request["grammar"] = grammarJSON
			
 
				-				default:
			
 
				-					if req.Format[0] != '{' {
			
 
				-						return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
			
 
				-					}
			
 
				+		switch string(req.Format) {
			
 
				+		case `null`, `""`:
			
 
				+			// Field was set, but "missing" a value. We accept
			
 
				+			// these as "not set".
			
 
				+			break
			
 
				+		case `"json"`:
			
 
				+			request["grammar"] = grammarJSON
			
 
				+		default:
			
 
				+			if req.Format[0] != '{' {
			
 
				+				return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
			
 
				+			}
			
 
				 
			
 
				-					// User provided a JSON schema
			
 
				-					g := llama.SchemaToGrammar(req.Format)
			
 
				-					if g == nil {
			
 
				-						return fmt.Errorf("invalid JSON schema in format")
			
 
				-					}
			
 
				-					request["grammar"] = string(g)
			
 
				-				}
			
 
				+			// User provided a JSON schema
			
 
				+			g := llama.SchemaToGrammar(req.Format)
			
 
				+			if g == nil {
			
 
				+				return fmt.Errorf("invalid JSON schema in format")
			
 
				 			}
			
 
				+			request["grammar"] = string(g)
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -254,6 +254,12 @@ type Server struct {
 
				 	// multimodalHash generates hashes for comparing equality
			
 
				 	// of non-text data
			
 
				 	multimodalHash maphash.Hash
			
 
				+
			
 
				+	// vocab is a llama.cpp vocab required for gammar-based
			
 
				+	// constrained generation (json mode, structured outputs)
			
 
				+	// TODO: this is temporary until Ollama sampling supports
			
 
				+	// constrained generation
			
 
				+	vocab *sample.Vocab
			
 
				 }
			
 
				 
			
 
				 func (s *Server) allNil() bool {
			
@@ -574,18 +580,25 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 
				 		return
			
 
				 	}
			
 
				 
			
 
				+	var grammar *sample.Grammar
			
 
				+	var err error
			
 
				+	if req.Grammar != "" {
			
 
				+		grammar, err = sample.NewGrammar(s.vocab, req.Grammar)
			
 
				+		if err != nil {
			
 
				+			http.Error(w, "failed to load model vocabulary required for format", http.StatusInternalServerError)
			
 
				+			return
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	sampler := sample.NewSampler(
			
 
				 		req.Temperature,
			
 
				 		req.TopK,
			
 
				 		req.TopP,
			
 
				 		req.MinP,
			
 
				 		req.Seed,
			
 
				+		grammar,
			
 
				 	)
			
 
				 
			
 
				-	if req.Grammar != "" {
			
 
				-		panic("grammars are not yet supported")
			
 
				-	}
			
 
				-
			
 
				 	seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
			
 
				 		numPredict: req.NumPredict,
			
 
				 		stop:       req.Stop,
			
@@ -797,6 +810,8 @@ func (s *Server) loadModel(
 
				 		panic(err)
			
 
				 	}
			
 
				 
			
 
				+	s.vocab = sample.NewVocab(mpath)
			
 
				+
			
 
				 	// TODO(jessegross): LoRA loading
			
 
				 	if lpath.String() != "" {
			
 
				 		panic("loras are not yet implemented")
			
--- a/sample/samplers.go
+++ b/sample/samplers.go
@@ -2,43 +2,88 @@ package sample
 
				 
			
 
				 import (
			
 
				 	"errors"
			
 
				+	"math"
			
 
				 	"math/rand/v2"
			
 
				 	"slices"
			
 
				-)
			
 
				+	"sync"
			
 
				 
			
 
				-// Sampler is not thread-safe. Each goroutine should have its own instance
			
 
				-type Sampler interface {
			
 
				-	Sample([]float32) (int32, error)
			
 
				-}
			
 
				+	"github.com/ollama/ollama/llama"
			
 
				+)
			
 
				 
			
 
				-// logit represents information about a single token during sampling
			
 
				-type logit struct {
			
 
				+// token represents information about a single token during sampling
			
 
				+type token struct {
			
 
				 	id    int32   // The token's unique identifier
			
 
				 	value float32 // The raw logit or probability from the model
			
 
				 }
			
 
				 
			
 
				-type weighted struct {
			
 
				+type Sampler struct {
			
 
				 	rng         *rand.Rand
			
 
				-	tokens      []logit
			
 
				 	topK        int
			
 
				 	topP        float32
			
 
				 	minP        float32
			
 
				 	temperature float32
			
 
				+	grammar     *Grammar
			
 
				 }
			
 
				 
			
 
				-func (s *weighted) Sample(logits []float32) (int32, error) {
			
 
				-	if len(s.tokens) < len(logits) {
			
 
				-		s.tokens = make([]logit, len(logits))
			
 
				+func (s *Sampler) Sample(logits []float32) (int32, error) {
			
 
				+	tokens := make([]token, len(logits))
			
 
				+	for i := range logits {
			
 
				+		tokens[i].id = int32(i)
			
 
				+		tokens[i].value = logits[i]
			
 
				 	}
			
 
				 
			
 
				-	tokens := s.tokens[:len(logits)]
			
 
				+	t, err := s.sample(tokens)
			
 
				+	if err != nil {
			
 
				+		return -1, err
			
 
				+	}
			
 
				 
			
 
				-	for i, v := range logits {
			
 
				-		tokens[i].id = int32(i)
			
 
				-		tokens[i].value = v
			
 
				+	if s.grammar != nil {
			
 
				+		// optimization: first check if the max logit is accepted by the grammar
			
 
				+		// if the max logit is rejected, apply the grammar to all logits (slower)
			
 
				+		top := []token{t}
			
 
				+		s.grammar.Apply(top)
			
 
				+		if !math.IsInf(float64(top[0].value), -1) {
			
 
				+			s.grammar.Accept(top[0].id)
			
 
				+			return top[0].id, nil
			
 
				+		}
			
 
				+
			
 
				+		// since .sample has side effects of modifying the tokens
			
 
				+		// we need to reset them before applying the grammar and
			
 
				+		// sampling again
			
 
				+		for i := range logits {
			
 
				+			tokens[i].id = int32(i)
			
 
				+			tokens[i].value = logits[i]
			
 
				+		}
			
 
				+		s.grammar.Apply(tokens)
			
 
				+		t, err = s.sample(tokens)
			
 
				+		if err != nil {
			
 
				+			return -1, err
			
 
				+		}
			
 
				+		s.grammar.Accept(t.id)
			
 
				+	}
			
 
				+
			
 
				+	return t.id, nil
			
 
				+}
			
 
				+
			
 
				+// greedy returns the highest probability token from the tokens
			
 
				+func greedy(tokens []token) token {
			
 
				+	max := tokens[0]
			
 
				+	for i := 1; i < len(tokens); i++ {
			
 
				+		if tokens[i].value > max.value {
			
 
				+			max = tokens[i]
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return max
			
 
				+}
			
 
				+
			
 
				+// sample returns the highest probability token from the tokens
			
 
				+// given sampler parameters. It also has side effects of modifying the tokens
			
 
				+func (s *Sampler) sample(tokens []token) (token, error) {
			
 
				+	if s.temperature == 0 {
			
 
				+		return greedy(tokens), nil
			
 
				 	}
			
 
				 
			
 
				-	// Tokens are sorted by logits in TopK or SortTokens
			
 
				 	if s.topK > 0 {
			
 
				 		tokens = topK(tokens, s.topK)
			
 
				 	} else {
			
@@ -47,12 +92,14 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
 
				 
			
 
				 	tokens = temperature(tokens, s.temperature)
			
 
				 	tokens = softmax(tokens)
			
 
				-
			
 
				 	tokens = topP(tokens, s.topP)
			
 
				 	tokens = minP(tokens, s.minP)
			
 
				 
			
 
				+	// TODO: this should fall back to greedy sampling
			
 
				+	// or topP, topK values etc should be such that
			
 
				+	// there are always tokens to sample from
			
 
				 	if len(tokens) == 0 {
			
 
				-		return -1, errors.New("no valid logits found for weighted sampling")
			
 
				+		return token{}, errors.New("no tokens to sample from")
			
 
				 	}
			
 
				 
			
 
				 	var r float32
			
@@ -70,48 +117,18 @@ func (s *weighted) Sample(logits []float32) (int32, error) {
 
				 	}
			
 
				 	r *= tokens[len(tokens)-1].value
			
 
				 
			
 
				-	idx, _ := slices.BinarySearchFunc(tokens, r, func(token logit, target float32) int {
			
 
				-		// Compare cumulative probabilities
			
 
				+	idx, _ := slices.BinarySearchFunc(tokens, r, func(token token, target float32) int {
			
 
				 		if token.value < target {
			
 
				 			return -1
			
 
				 		}
			
 
				-		// First token that exceeds target
			
 
				 		return 1
			
 
				 	})
			
 
				 
			
 
				-	if idx >= len(tokens) {
			
 
				-		idx = len(tokens) - 1
			
 
				-	}
			
 
				-
			
 
				-	return tokens[idx].id, nil
			
 
				-}
			
 
				-
			
 
				-type greedy struct{}
			
 
				-
			
 
				-// Greedy sample returns the index of the maximum value in logits.
			
 
				-func (s greedy) Sample(logits []float32) (int32, error) {
			
 
				-	if len(logits) == 0 {
			
 
				-		return -1, errors.New("no logits provided for greedy sampling")
			
 
				-	}
			
 
				-
			
 
				-	maxIdx := 0
			
 
				-	maxVal := logits[0]
			
 
				-	for i := 1; i < len(logits); i++ {
			
 
				-		if logits[i] > maxVal {
			
 
				-			maxVal = logits[i]
			
 
				-			maxIdx = i
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	return int32(maxIdx), nil
			
 
				+	return tokens[idx], nil
			
 
				 }
			
 
				 
			
 
				 // TODO(parthsareen): update sampler interface to use json unmarshal https://github.com/ollama/ollama/issues/9278
			
 
				-func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int) Sampler {
			
 
				-	if temperature == 0 {
			
 
				-		return &greedy{}
			
 
				-	}
			
 
				-
			
 
				+func NewSampler(temperature float32, topK int, topP float32, minP float32, seed int, grammar *Grammar) Sampler {
			
 
				 	var rng *rand.Rand
			
 
				 	if seed != -1 {
			
 
				 		// PCG requires two parameters: sequence and stream
			
@@ -120,7 +137,9 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
 
				 		// Use golden ratio hash to generate statistically independent seeds
			
 
				 		rng = rand.New(rand.NewPCG(sequence, sequence^0x9E3779B9))
			
 
				 	}
			
 
				-	temperature = max(temperature, 1)
			
 
				+	if temperature < 0.0 {
			
 
				+		temperature = 0.0
			
 
				+	}
			
 
				 
			
 
				 	if topP < 0.0 {
			
 
				 		topP = 0.0
			
@@ -136,11 +155,73 @@ func NewSampler(temperature float32, topK int, topP float32, minP float32, seed
 
				 		minP = 1.0
			
 
				 	}
			
 
				 
			
 
				-	return &weighted{
			
 
				+	return Sampler{
			
 
				 		rng:         rng,
			
 
				 		topK:        topK,
			
 
				 		topP:        topP,
			
 
				 		minP:        minP,
			
 
				 		temperature: temperature,
			
 
				+		grammar:     grammar,
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+type Grammar struct {
			
 
				+	vocab   *Vocab
			
 
				+	grammar string
			
 
				+	sampler *llama.Sampler
			
 
				+}
			
 
				+
			
 
				+func NewGrammar(vocab *Vocab, grammar string) (*Grammar, error) {
			
 
				+	v, err := vocab.Load()
			
 
				+	if err != nil {
			
 
				+		return nil, err
			
 
				+	}
			
 
				+
			
 
				+	return &Grammar{
			
 
				+		vocab:   vocab,
			
 
				+		grammar: grammar,
			
 
				+		sampler: llama.NewGrammarSampler(v, grammar),
			
 
				+	}, nil
			
 
				+}
			
 
				+
			
 
				+func (g *Grammar) Apply(tokens []token) {
			
 
				+	tds := make([]llama.TokenData, len(tokens))
			
 
				+	for i, token := range tokens {
			
 
				+		tds[i].Id = token.id
			
 
				+		tds[i].Logit = token.value
			
 
				 	}
			
 
				+
			
 
				+	g.sampler.Apply(tds)
			
 
				+
			
 
				+	for i := range tokens {
			
 
				+		tokens[i].value = tds[i].Logit
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func (g *Grammar) Accept(token int32) {
			
 
				+	g.sampler.Accept(token)
			
 
				+}
			
 
				+
			
 
				+type Vocab struct {
			
 
				+	once  sync.Once
			
 
				+	vocab *llama.Vocab
			
 
				+	err   error
			
 
				+	path  string
			
 
				+}
			
 
				+
			
 
				+func NewVocab(path string) *Vocab {
			
 
				+	return &Vocab{path: path}
			
 
				+}
			
 
				+
			
 
				+// Load returns the lazily-loaded vocabulary
			
 
				+func (v *Vocab) Load() (*llama.Vocab, error) {
			
 
				+	v.once.Do(func() {
			
 
				+		vocab, err := llama.LoadVocabFromFile(v.path)
			
 
				+		if err != nil {
			
 
				+			v.err = err
			
 
				+			return
			
 
				+		}
			
 
				+		v.vocab = vocab
			
 
				+	})
			
 
				+	return v.vocab, v.err
			
 
				 }
			
--- a/sample/samplers_benchmark_test.go
+++ b/sample/samplers_benchmark_test.go
@@ -16,13 +16,10 @@ func BenchmarkWeightedSampler(b *testing.B) {
 
				 				logits[i] = float32(rand.Float64()*10 - 5)
			
 
				 			}
			
 
				 
			
 
				-			sampler := NewSampler(0.8, 0, 0, 0, 42)
			
 
				+			sampler := NewSampler(0.8, 0, 0, 0, 42, nil)
			
 
				 			b.ResetTimer()
			
 
				 			for b.Loop() {
			
 
				-				_, err := sampler.Sample(logits)
			
 
				-				if err != nil {
			
 
				-					b.Fatalf("Sampling failed: %v", err)
			
 
				-				}
			
 
				+				sampler.Sample(logits)
			
 
				 			}
			
 
				 		})
			
 
				 	}
			
@@ -52,30 +49,24 @@ func BenchmarkWeightedSampler(b *testing.B) {
 
				 
			
 
				 	for _, tc := range configs {
			
 
				 		b.Run("Config"+tc.name, func(b *testing.B) {
			
 
				-			sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed)
			
 
				+			sampler := NewSampler(tc.temperature, tc.topK, tc.topP, tc.minP, tc.seed, nil)
			
 
				 			sampler.Sample(logits)
			
 
				 
			
 
				 			b.ResetTimer()
			
 
				 
			
 
				 			for b.Loop() {
			
 
				-				_, err := sampler.Sample(logits)
			
 
				-				if err != nil {
			
 
				-					b.Fatalf("Sampling failed: %v", err)
			
 
				-				}
			
 
				+				sampler.Sample(logits)
			
 
				 			}
			
 
				 		})
			
 
				 	}
			
 
				 
			
 
				 	// Test with combined transforms separately - topK influences performance greatly
			
 
				 	b.Run("TransformCombined", func(b *testing.B) {
			
 
				-		sampler := NewSampler(0.8, 50, 0.9, 0.05, 42)
			
 
				+		sampler := NewSampler(0.8, 50, 0.9, 0.05, 42, nil)
			
 
				 		b.ResetTimer()
			
 
				 
			
 
				 		for b.Loop() {
			
 
				-			_, err := sampler.Sample(logits)
			
 
				-			if err != nil {
			
 
				-				b.Fatalf("Sampling failed: %v", err)
			
 
				-			}
			
 
				+			sampler.Sample(logits)
			
 
				 		}
			
 
				 	})
			
 
				 }
			
@@ -90,14 +81,11 @@ func BenchmarkGreedySampler(b *testing.B) {
 
				 				logits[i] = float32(rand.Float64()*10 - 5)
			
 
				 			}
			
 
				 
			
 
				-			sampler := NewSampler(0, -1, 0, 0, -1)
			
 
				+			sampler := NewSampler(0, -1, 0, 0, -1, nil)
			
 
				 			b.ResetTimer()
			
 
				 
			
 
				 			for b.Loop() {
			
 
				-				_, err := sampler.Sample(logits)
			
 
				-				if err != nil {
			
 
				-					b.Fatalf("Sampling failed: %v", err)
			
 
				-				}
			
 
				+				sampler.Sample(logits)
			
 
				 			}
			
 
				 		})
			
 
				 	}
			
--- a/sample/samplers_test.go
+++ b/sample/samplers_test.go
@@ -7,7 +7,7 @@ import (
 
				 
			
 
				 func TestWeighted(t *testing.T) {
			
 
				 	logits := []float32{-10, 3, -10, -10}
			
 
				-	sampler := NewSampler(0, 0, 0, 0, 0)
			
 
				+	sampler := NewSampler(0, 0, 0, 0, 0, nil)
			
 
				 	got, err := sampler.Sample(logits)
			
 
				 	if err != nil {
			
 
				 		t.Error(err)
			
@@ -19,7 +19,7 @@ func TestWeighted(t *testing.T) {
 
				 	}
			
 
				 
			
 
				 	logits = []float32{-100, -10, 0, 10}
			
 
				-	sampler = NewSampler(0, 0, 0, 0, 0)
			
 
				+	sampler = NewSampler(0, 0, 0, 0, 0, nil)
			
 
				 	got, err = sampler.Sample(logits)
			
 
				 	if err != nil {
			
 
				 		t.Error(err)
			
@@ -31,94 +31,10 @@ func TestWeighted(t *testing.T) {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-func TestNewSampler(t *testing.T) {
			
 
				-	tests := []struct {
			
 
				-		name        string
			
 
				-		temperature float32
			
 
				-		topK        int
			
 
				-		topP        float32
			
 
				-		minP        float32
			
 
				-		seed        int
			
 
				-		wantGreedy  bool // Instead of wantErr, check if we get greedy sampler
			
 
				-	}{
			
 
				-		{
			
 
				-			name:        "temperature",
			
 
				-			temperature: 0.5,
			
 
				-			wantGreedy:  false,
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "zero temperature - greedy",
			
 
				-			temperature: 0,
			
 
				-			wantGreedy:  true,
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "top k",
			
 
				-			temperature: 0.1,
			
 
				-			topK:        10,
			
 
				-			wantGreedy:  false,
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "top p",
			
 
				-			temperature: 0.1,
			
 
				-			topP:        0.9,
			
 
				-			wantGreedy:  false,
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "min p",
			
 
				-			temperature: 0.1,
			
 
				-			minP:        0.2,
			
 
				-			wantGreedy:  false,
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "seed - weighted",
			
 
				-			temperature: 0.1,
			
 
				-			seed:        42,
			
 
				-			wantGreedy:  false,
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "default values",
			
 
				-			temperature: 0.8,
			
 
				-			topK:        40,
			
 
				-			topP:        0.9,
			
 
				-			minP:        0.0,
			
 
				-			seed:        0,
			
 
				-			wantGreedy:  false,
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "all zeroes - greedy",
			
 
				-			temperature: 0.0,
			
 
				-			topK:        0,
			
 
				-			topP:        0.0,
			
 
				-			minP:        0.0,
			
 
				-			seed:        0,
			
 
				-			wantGreedy:  true,
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "all transforms",
			
 
				-			temperature: 0.8,
			
 
				-			topK:        50,
			
 
				-			topP:        0.95,
			
 
				-			minP:        0.1,
			
 
				-			seed:        42,
			
 
				-			wantGreedy:  false,
			
 
				-		},
			
 
				-	}
			
 
				-	for _, tt := range tests {
			
 
				-		t.Run(tt.name, func(t *testing.T) {
			
 
				-			sampler := NewSampler(tt.temperature, tt.topK, tt.topP, tt.minP, tt.seed)
			
 
				-			_, isGreedy := sampler.(*greedy)
			
 
				-			if isGreedy != tt.wantGreedy {
			
 
				-				t.Errorf("NewSampler() got greedy = %v, want %v", isGreedy, tt.wantGreedy)
			
 
				-			}
			
 
				-		})
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				 func BenchmarkSample(b *testing.B) {
			
 
				-	weighted := NewSampler(0.5, 10, 0.9, 0.2, -1)
			
 
				 	samplers := map[string]Sampler{
			
 
				-		"Greedy":   NewSampler(0, 0, 0, 0, 0), // Use NewSampler with temp=0 for greedy
			
 
				-		"Weighted": weighted,
			
 
				+		"Greedy":   NewSampler(0, 0, 0, 0, 0, nil), // Use NewSampler with temp=0 for greedy
			
 
				+		"Weighted": NewSampler(0.5, 10, 0.9, 0.2, -1, nil),
			
 
				 	}
			
 
				 
			
 
				 	// Generate random logits for benchmarking
			
@@ -132,7 +48,7 @@ func BenchmarkSample(b *testing.B) {
 
				 			b.ResetTimer()
			
 
				 			for b.Loop() {
			
 
				 				if _, err := s.Sample(logits); err != nil {
			
 
				-					b.Error(err)
			
 
				+					b.Fatalf("error sampling: %v", err)
			
 
				 				}
			
 
				 			}
			
 
				 		})
			
--- a/sample/transforms.go
+++ b/sample/transforms.go
@@ -5,7 +5,7 @@ import (
 
				 	"slices"
			
 
				 )
			
 
				 
			
 
				-func softmax(ts []logit) []logit {
			
 
				+func softmax(ts []token) []token {
			
 
				 	var sum float32
			
 
				 	for i, v := range ts {
			
 
				 		ts[i].value = float32(math.Exp(float64(v.value)))
			
@@ -19,7 +19,7 @@ func softmax(ts []logit) []logit {
 
				 	return ts
			
 
				 }
			
 
				 
			
 
				-func temperature(ti []logit, t float32) []logit {
			
 
				+func temperature(ti []token, t float32) []token {
			
 
				 	if t == 1 {
			
 
				 		return ti
			
 
				 	}
			
@@ -51,7 +51,7 @@ func temperature(ti []logit, t float32) []logit {
 
				 // 1. Finds the smallest value between the node and its children
			
 
				 // 2. If the node is not the smallest, swaps it with its smallest child
			
 
				 // 3. Continues this process down the affected path until the min-heap property is restored
			
 
				-func siftDown(data []logit, start, end int) {
			
 
				+func siftDown(data []token, start, end int) {
			
 
				 	root := start
			
 
				 	for {
			
 
				 		child := 2*root + 1
			
@@ -73,7 +73,7 @@ func siftDown(data []logit, start, end int) {
 
				 }
			
 
				 
			
 
				 // topK limits the number of tokens considered to the k highest logits
			
 
				-func topK(ts []logit, k int) []logit {
			
 
				+func topK(ts []token, k int) []token {
			
 
				 	if k >= len(ts) {
			
 
				 		return ts
			
 
				 	}
			
@@ -99,7 +99,7 @@ func topK(ts []logit, k int) []logit {
 
				 }
			
 
				 
			
 
				 // topP limits tokens to those with cumulative probability p
			
 
				-func topP(ts []logit, p float32) []logit {
			
 
				+func topP(ts []token, p float32) []token {
			
 
				 	if p == 1.0 {
			
 
				 		return ts
			
 
				 	}
			
@@ -118,7 +118,7 @@ func topP(ts []logit, p float32) []logit {
 
				 }
			
 
				 
			
 
				 // minP limits tokens to those with cumulative probability p
			
 
				-func minP(ts []logit, p float32) []logit {
			
 
				+func minP(ts []token, p float32) []token {
			
 
				 	if p == 1.0 {
			
 
				 		return ts
			
 
				 	}
			
@@ -146,7 +146,7 @@ func minP(ts []logit, p float32) []logit {
 
				 
			
 
				 // TODO(parthsareen): possibly replace with simpler implementation https://github.com/ollama/ollama/issues/9584
			
 
				 // Conting sort implementation to sort tokens by logits
			
 
				-func sortLogits(tokens []logit) {
			
 
				+func sortLogits(tokens []token) {
			
 
				 	if len(tokens) <= 1 {
			
 
				 		return
			
 
				 	}
			
@@ -187,7 +187,7 @@ func sortLogits(tokens []logit) {
 
				 	}
			
 
				 
			
 
				 	// Second pass: place elements in correct position
			
 
				-	output := make([]logit, len(tokens))
			
 
				+	output := make([]token, len(tokens))
			
 
				 	// Track current positions
			
 
				 	countsCopy := counts
			
 
				 
			
--- a/sample/transforms_test.go
+++ b/sample/transforms_test.go
@@ -7,10 +7,10 @@ import (
 
				 )
			
 
				 
			
 
				 // Helper to convert float64 slice to logit slice
			
 
				-func toLogits(values []float64) []logit {
			
 
				-	tokens := make([]logit, len(values))
			
 
				+func toTokens(values []float64) []token {
			
 
				+	tokens := make([]token, len(values))
			
 
				 	for i, v := range values {
			
 
				-		tokens[i] = logit{
			
 
				+		tokens[i] = token{
			
 
				 			id:    int32(i),
			
 
				 			value: float32(v),
			
 
				 		}
			
@@ -19,7 +19,7 @@ func toLogits(values []float64) []logit {
 
				 }
			
 
				 
			
 
				 // Helper to compare logit slices
			
 
				-func compareLogits(t *testing.T, name string, want []float64, got []logit) {
			
 
				+func compareLogits(t *testing.T, name string, want []float64, got []token) {
			
 
				 	t.Helper()
			
 
				 	if len(want) != len(got) {
			
 
				 		t.Errorf("%s: length mismatch: want %d, got %d", name, len(want), len(got))
			
@@ -36,13 +36,13 @@ func TestTemperature(t *testing.T) {
 
				 	input := []float64{2, -1, 4, -3, 1, -2, 0}
			
 
				 	want := []float64{-4, -10, 0, -14, -6, -12, -8} // (logit - max logit) / temp
			
 
				 
			
 
				-	got := temperature(toLogits(input), 0.5)
			
 
				+	got := temperature(toTokens(input), 0.5)
			
 
				 	compareLogits(t, "Temperature", want, got)
			
 
				 }
			
 
				 
			
 
				 func TestSoftmax(t *testing.T) {
			
 
				 	input := []float64{-3, -2, -1, 0, 1, 2, 4}
			
 
				-	got := softmax(toLogits(input))
			
 
				+	got := softmax(toTokens(input))
			
 
				 
			
 
				 	// Check probabilities sum to 1
			
 
				 	var sum float32
			
@@ -65,7 +65,7 @@ func TestTopK(t *testing.T) {
 
				 	input := []float64{-3, -2, -1, 0, 1, 2, 4}
			
 
				 
			
 
				 	// Test k=3
			
 
				-	got := topK(toLogits(input), 3)
			
 
				+	got := topK(toTokens(input), 3)
			
 
				 	if len(got) != 3 {
			
 
				 		t.Errorf("topK(3): wrong length: want 3, got %d", len(got))
			
 
				 	}
			
@@ -74,13 +74,13 @@ func TestTopK(t *testing.T) {
 
				 	compareLogits(t, "topK(3)", want, got)
			
 
				 
			
 
				 	// Test k > len
			
 
				-	got = topK(toLogits(input), 10)
			
 
				+	got = topK(toTokens(input), 10)
			
 
				 	compareLogits(t, "topK(10)", input, got)
			
 
				 }
			
 
				 
			
 
				 func TestTopP(t *testing.T) {
			
 
				 	input := []float64{-3, -2, -1, 0, 1, 2, 4}
			
 
				-	tokens := toLogits(input)
			
 
				+	tokens := toTokens(input)
			
 
				 
			
 
				 	// First apply temperature and softmax to get probabilities
			
 
				 	tokens = temperature(tokens, 1)
			
@@ -99,7 +99,7 @@ func TestTopP(t *testing.T) {
 
				 
			
 
				 func TestMinP(t *testing.T) {
			
 
				 	input := []float64{-3, -2, -1, 0, 1, 2, 4, 3}
			
 
				-	tokens := toLogits(input)
			
 
				+	tokens := toTokens(input)
			
 
				 
			
 
				 	// First apply temperature and softmax
			
 
				 	tokens = temperature(tokens, 1)
			
@@ -116,7 +116,7 @@ func TestMinP(t *testing.T) {
 
				 
			
 
				 func TestSortLogits(t *testing.T) {
			
 
				 	input := []float64{3, 1, 4, 2, -1, 0, -2}
			
 
				-	tokens := toLogits(input)
			
 
				+	tokens := toTokens(input)
			
 
				 
			
 
				 	sortLogits(tokens)
			
 
				 
			
@@ -133,15 +133,15 @@ func TestSortLogits(t *testing.T) {
 
				 
			
 
				 func BenchmarkTransforms(b *testing.B) {
			
 
				 	// Generate random logits
			
 
				-	tokens := make([]logit, 1<<16)
			
 
				+	tokens := make([]token, 1<<16)
			
 
				 	for i := range tokens {
			
 
				-		tokens[i] = logit{
			
 
				+		tokens[i] = token{
			
 
				 			id:    int32(i),
			
 
				 			value: rand.Float32(),
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	tokensCopy := make([]logit, len(tokens))
			
 
				+	tokensCopy := make([]token, len(tokens))
			
 
				 
			
 
				 	b.Run("Temperature", func(b *testing.B) {
			
 
				 		b.ResetTimer()