
model: Update encoder cache to use multimodal input processing handler

The encoder cache needs to know the position of images in the input
stream so that it can tell when to delete them. Previously images didn't
have a position, so we implied one by breaking batches before an
image and then assuming the image was in the first position. However,
multimodal objects are now given explicit positions in the input
stream, so we can use those instead.
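
Concretely, a cache can now recover an image's position straight from the
batch options. A minimal sketch, assuming the types introduced in
model/input below and an opts value built by the runner for the current
forward pass:

	// Each multimodal element carries an index into the batch; the entry
	// at that index in Positions is the position the encoder cache needs.
	// Here we look at the most recent image in the batch.
	if len(opts.Multimodal) > 0 {
		last := opts.Multimodal[len(opts.Multimodal)-1]
		imagePos := opts.Positions[last.Index]
		_ = imagePos // e.g. remembered so the image can later be freed
	}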

Breaking batches was also a way to simulate a cross-attention mask
for mllama. However, since mllama only supports a single sequence
and a single image, this mask doesn't serve any real purpose.
Removing the batch break does not appear to affect the quality of
the output.

Most of this is simply moving the input data structures to a new
package to avoid import cycles.
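
As an illustration of the cycle being avoided: kvcache is already imported
by model, so once the cache interface needs to see Input/Options those
types can't live in the model package without forming a
model -> kvcache -> model loop. A rough sketch of the import graph after
this change, as reflected in the diffs below:

	// model/input depends on nothing else in the tree, so every layer can import it:
	//   kvcache             -> ml, model/input
	//   model               -> kvcache, ml, model/input
	//   model/models/*      -> ml, model, model/input
	//   runner/ollamarunner -> kvcache, ml, model, model/input
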
Jesse Gross, 1 month ago
commit a1cda80bcb

+ 2 - 1
kvcache/cache.go

@@ -4,6 +4,7 @@ import (
 	"errors"
 
 	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/input"
 )
 
 var (
@@ -51,7 +52,7 @@ type Cache interface {
 	// StartForward is called before the start of the model's forward pass.
 	// For each token in the coming batch, there must be a corresponding
 	// entry in positions and seqs.
-	StartForward(ctx ml.Context, positions []int32, seqs []int) error
+	StartForward(ctx ml.Context, opts input.Options) error
 
 	// CopyPrefix copies tokens in the range [0, len) from srcSeq to dstSeq
 	CopyPrefix(srcSeq, dstSeq int, len int32)

+ 7 - 6
kvcache/causal.go

@@ -8,6 +8,7 @@ import (
 	"slices"
 
 	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/input"
 )
 
 type shiftFn func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error)
@@ -140,10 +141,10 @@ func (c *Causal) Close() {
 	}
 }
 
-func (c *Causal) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
-	c.curBatchSize = len(positions)
-	c.curSequences = seqs
-	c.curPositions = positions
+func (c *Causal) StartForward(ctx ml.Context, opts input.Options) error {
+	c.curBatchSize = len(opts.Positions)
+	c.curSequences = opts.Sequences
+	c.curPositions = opts.Positions
 
 	var err error
 	c.curLoc, err = c.findStartLoc()
@@ -156,8 +157,8 @@ func (c *Causal) StartForward(ctx ml.Context, positions []int32, seqs []int) err
 	}
 
 	c.curCellRange = newRange()
-	for i, pos := range positions {
-		seq := seqs[i]
+	for i, pos := range opts.Positions {
+		seq := opts.Sequences[i]
 
 		c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
 

+ 2 - 1
kvcache/causal_test.go

@@ -6,6 +6,7 @@ import (
 	"testing"
 
 	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/input"
 )
 
 type testCase struct {
@@ -269,7 +270,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
 			context := backend.NewContext()
 			defer context.Close()
 
-			err := cache.StartForward(context, test.pos, test.seqs)
+			err := cache.StartForward(context, input.Options{Positions: test.pos, Sequences: test.seqs})
 			if err != nil {
 				panic(err)
 			}

+ 6 - 3
kvcache/encoder.go

@@ -4,6 +4,7 @@ import (
 	"fmt"
 
 	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/input"
 )
 
 // Encoder cache stores K and V tensors that are position independent
@@ -78,9 +79,11 @@ func (c *EncoderCache) Close() {
 	}
 }
 
-func (c *EncoderCache) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
-	// The image is always in the first position
-	c.curPos = positions[0]
+func (c *EncoderCache) StartForward(ctx ml.Context, opts input.Options) error {
+	// We work with the most recent image
+	if len(opts.Multimodal) > 0 {
+		c.curPos = opts.Positions[opts.Multimodal[len(opts.Multimodal)-1].Index]
+	}
 
 	return nil
 }

+ 5 - 4
kvcache/wrapper.go

@@ -4,6 +4,7 @@ import (
 	"math"
 
 	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/input"
 )
 
 // Wrapper cache is a container for multiple types of caches,
@@ -40,14 +41,14 @@ func (c *WrapperCache) Close() {
 	}
 }
 
-func (c *WrapperCache) StartForward(ctx ml.Context, positions []int32, seqs []int) error {
+func (c *WrapperCache) StartForward(ctx ml.Context, opts input.Options) error {
 	for i, cache := range c.caches {
-		err := cache.StartForward(ctx, positions, seqs)
+		err := cache.StartForward(ctx, opts)
 		if err != nil {
 			// unwind on error - Remove with endIndex set to math.MaxInt32 does not fail
 			for j := i - 1; j >= 0; j-- {
-				for k := range positions {
-					_ = c.caches[j].Remove(seqs[k], positions[k], math.MaxInt32)
+				for k := range opts.Positions {
+					_ = c.caches[j].Remove(opts.Sequences[k], opts.Positions[k], math.MaxInt32)
 				}
 			}
 			return err

+ 37 - 0
model/input/input.go

@@ -0,0 +1,37 @@
+package input
+
+// Input represents one token in the input stream
+type Input struct {
+	// Token is a single element of text.
+	Token int32
+
+	// Multimodal is opaque data representing a non-text
+	// element such as an image (or part of one if the image
+	// can be processed in pieces). It may be either together
+	// with Token or on its own.
+	Multimodal any
+
+	// MultimodalHash is a unique representation of the data
+	// stored in Multimodal, used for caching and comparing
+	// equality.
+	MultimodalHash uint64
+}
+
+// MultimodalIndex is a multimodal element (such as an image)
+// together with an index into the slice of Inputs with the
+// corresponding token. Note that the index is not the same
+// as the position - to find that use the index with the
+// Positions slice.
+type MultimodalIndex struct {
+	Index      int
+	Multimodal any
+}
+
+// Options contains the inputs for a model forward pass
+type Options struct {
+	Inputs     []int32
+	Multimodal []MultimodalIndex
+	Positions  []int32
+	Sequences  []int
+	Outputs    []int32
+}

+ 24 - 59
model/model.go

@@ -19,66 +19,12 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	_ "github.com/ollama/ollama/ml/backend"
+	"github.com/ollama/ollama/model/input"
 )
 
-// Input represents one token in the input stream
-type Input struct {
-	// Token is a single element of text.
-	Token int32
-
-	// Multimodal is opaque data representing a non-text
-	// element such as an image (or part of one if the image
-	// can be processed in pieces). It may be either together
-	// with Token or on its own.
-	Multimodal any
-
-	// MultimodalHash is a unique representation of the data
-	// stored in Multimodal, used for caching and comparing
-	// equality.
-	MultimodalHash uint64
-}
-
-// MultimodalIndex is a multimodal element (such as an image)
-// together with an index into the slice of Inputs with the
-// corresponding token. Note that the index is not the same
-// as the position - to find that use the index with the
-// Positions slice.
-type MultimodalIndex struct {
-	Index      int
-	Multimodal any
-}
-
-// Options contains the inputs for a model forward pass
-type Options struct {
-	Inputs     []int32
-	Multimodal []MultimodalIndex
-	Positions  []int32
-	Sequences  []int
-	Outputs    []int32
-}
-
-type config struct {
-	Cache kvcache.Cache
-}
-
-// Base implements the common fields and methods for all models
-type Base struct {
-	b ml.Backend
-	config
-}
-
-// Backend returns the underlying backend that will run the model
-func (m *Base) Backend() ml.Backend {
-	return m.b
-}
-
-func (m *Base) Config() config {
-	return m.config
-}
-
 // Model implements a specific model architecture, defining the forward pass and any model-specific configuration
 type Model interface {
-	Forward(ml.Context, Options) (ml.Tensor, error)
+	Forward(ml.Context, input.Options) (ml.Tensor, error)
 
 	Backend() ml.Backend
 	Config() config
@@ -112,7 +58,26 @@ type MultimodalProcessor interface {
 	// This function is also responsible for updating MultimodalHash for any Multimodal
 	// that is modified to ensure that there is a unique hash value that accurately
 	// represents the contents.
-	PostTokenize(ml.Context, []Input) ([]Input, error)
+	PostTokenize(ml.Context, []input.Input) ([]input.Input, error)
+}
+
+// Base implements the common fields and methods for all models
+type Base struct {
+	b ml.Backend
+	config
+}
+
+type config struct {
+	Cache kvcache.Cache
+}
+
+// Backend returns the underlying backend that will run the model
+func (m *Base) Backend() ml.Backend {
+	return m.b
+}
+
+func (m *Base) Config() config {
+	return m.config
 }
 
 var models = make(map[string]func(ml.Config) (Model, error))
@@ -313,7 +278,7 @@ func canNil(t reflect.Type) bool {
 		t.Kind() == reflect.Slice
 }
 
-func Forward(ctx ml.Context, m Model, opts Options) (ml.Tensor, error) {
+func Forward(ctx ml.Context, m Model, opts input.Options) (ml.Tensor, error) {
 	if len(opts.Positions) != len(opts.Sequences) {
 		return nil, fmt.Errorf("length of positions (%v) must match length of seqs (%v)", len(opts.Positions), len(opts.Sequences))
 	}
@@ -324,7 +289,7 @@ func Forward(ctx ml.Context, m Model, opts Options) (ml.Tensor, error) {
 
 	cache := m.Config().Cache
 	if cache != nil {
-		err := cache.StartForward(ctx, opts.Positions, opts.Sequences)
+		err := cache.StartForward(ctx, opts)
 		if err != nil {
 			return nil, err
 		}

+ 2 - 1
model/model_test.go

@@ -11,6 +11,7 @@ import (
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/backend/ggml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model/input"
 )
 
 func TestParseTags(t *testing.T) {
@@ -162,7 +163,7 @@ func TestGetTextProcessor(t *testing.T) {
 
 type notTextProcessorModel struct{}
 
-func (notTextProcessorModel) Forward(ml.Context, Options) (ml.Tensor, error) {
+func (notTextProcessorModel) Forward(ml.Context, input.Options) (ml.Tensor, error) {
 	panic("unimplemented")
 }
 

+ 2 - 1
model/models/llama/model.go

@@ -9,6 +9,7 @@ import (
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
 	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
 )
 
 type Options struct {
@@ -137,7 +138,7 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 	return hiddenState.Add(ctx, residual)
 }
 
-func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
+func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
 	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))
 	if err != nil {
 		return nil, err

+ 7 - 6
model/models/mllama/model.go

@@ -12,6 +12,7 @@ import (
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
 	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
 )
 
 type Model struct {
@@ -101,8 +102,8 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	return m.Projector.Forward(ctx, crossAttentionStates), nil
 }
 
-func (m *Model) PostTokenize(ctx ml.Context, inputs []model.Input) ([]model.Input, error) {
-	var images []model.Input
+func (m *Model) PostTokenize(ctx ml.Context, inputs []input.Input) ([]input.Input, error) {
+	var images []input.Input
 	fnvHash := fnv.New64a()
 
 	for i := range inputs {
@@ -125,15 +126,15 @@ func (m *Model) PostTokenize(ctx ml.Context, inputs []model.Input) ([]model.Inpu
 		}
 	}
 
-	inputs = slices.DeleteFunc(inputs, func(input model.Input) bool { return input.Token == -1 })
+	inputs = slices.DeleteFunc(inputs, func(input input.Input) bool { return input.Token == -1 })
 
 	return inputs, nil
 }
 
-func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
+func (m *Model) Forward(ctx ml.Context, opts input.Options) (ml.Tensor, error) {
 	var crossAttentionStates ml.Tensor
-	if opts.Multimodal != nil {
-		crossAttentionStates = opts.Multimodal[0].Multimodal.(ml.Tensor)
+	if len(opts.Multimodal) > 0 {
+		crossAttentionStates = opts.Multimodal[len(opts.Multimodal)-1].Multimodal.(ml.Tensor)
 	}
 
 	inputs, err := ctx.Input().FromIntSlice(opts.Inputs, len(opts.Inputs))

+ 7 - 6
runner/ollamarunner/cache.go

@@ -10,6 +10,7 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
 )
 
 type InputCache struct {
@@ -79,7 +80,7 @@ type InputCacheSlot struct {
 	Id int
 
 	// Inputs that are stored in the KV cache
-	Inputs []model.Input
+	Inputs []input.Input
 
 	// is this cache actively being processed as part of a sequence?
 	InUse bool
@@ -88,7 +89,7 @@ type InputCacheSlot struct {
 	lastUsed time.Time
 }
 
-func (c *InputCache) LoadCacheSlot(prompt []model.Input, cachePrompt bool) (*InputCacheSlot, []model.Input, error) {
+func (c *InputCache) LoadCacheSlot(prompt []input.Input, cachePrompt bool) (*InputCacheSlot, []input.Input, error) {
 	var slot *InputCacheSlot
 	var numPast int32
 	var err error
@@ -139,7 +140,7 @@ func (c *InputCache) LoadCacheSlot(prompt []model.Input, cachePrompt bool) (*Inp
 	return slot, prompt, nil
 }
 
-func (c *InputCache) findLongestCacheSlot(prompt []model.Input) (*InputCacheSlot, int32, error) {
+func (c *InputCache) findLongestCacheSlot(prompt []input.Input) (*InputCacheSlot, int32, error) {
 	longest := int32(-1)
 	var longestSlot *InputCacheSlot
 
@@ -162,7 +163,7 @@ func (c *InputCache) findLongestCacheSlot(prompt []model.Input) (*InputCacheSlot
 	return longestSlot, longest, nil
 }
 
-func (c *InputCache) findBestCacheSlot(prompt []model.Input) (*InputCacheSlot, int32, error) {
+func (c *InputCache) findBestCacheSlot(prompt []input.Input) (*InputCacheSlot, int32, error) {
 	oldest := time.Now()
 	var oldestSlot *InputCacheSlot
 
@@ -198,7 +199,7 @@ func (c *InputCache) findBestCacheSlot(prompt []model.Input) (*InputCacheSlot, i
 	if longest > 0 && longestSlot != oldestSlot {
 		slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
 			len(longestSlot.Inputs))
-		oldestSlot.Inputs = make([]model.Input, longest)
+		oldestSlot.Inputs = make([]input.Input, longest)
 		copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
 		if c.cache != nil {
 			c.cache.CopyPrefix(longestSlot.Id, oldestSlot.Id, longest)
@@ -208,7 +209,7 @@ func (c *InputCache) findBestCacheSlot(prompt []model.Input) (*InputCacheSlot, i
 	return oldestSlot, longest, nil
 }
 
-func countCommonPrefix(a []model.Input, b []model.Input) int32 {
+func countCommonPrefix(a []input.Input, b []input.Input) int32 {
 	var count int32
 
 	for i := range a {

+ 36 - 36
runner/ollamarunner/cache_test.go

@@ -5,7 +5,7 @@ import (
 	"testing"
 	"time"
 
-	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
 )
 
 func TestCountCommon(t *testing.T) {
@@ -15,50 +15,50 @@ func TestCountCommon(t *testing.T) {
 
 	tests := []struct {
 		name     string
-		t1       []model.Input
-		t2       []model.Input
+		t1       []input.Input
+		t2       []input.Input
 		expected int32
 	}{
 		{
 			name:     "Equal",
-			t1:       []model.Input{{Token: 1}, {Token: 2}, {Token: 3}},
-			t2:       []model.Input{{Token: 1}, {Token: 2}, {Token: 3}},
+			t1:       []input.Input{{Token: 1}, {Token: 2}, {Token: 3}},
+			t2:       []input.Input{{Token: 1}, {Token: 2}, {Token: 3}},
 			expected: 3,
 		},
 		{
 			name:     "Prefix",
-			t1:       []model.Input{{Token: 1}},
-			t2:       []model.Input{{Token: 1}, {Token: 2}, {Token: 3}},
+			t1:       []input.Input{{Token: 1}},
+			t2:       []input.Input{{Token: 1}, {Token: 2}, {Token: 3}},
 			expected: 1,
 		},
 		{
 			name:     "Image Prefix",
-			t1:       []model.Input{{Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []model.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
+			t1:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}},
+			t2:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
 			expected: 1,
 		},
 		{
 			name:     "Mixed",
-			t1:       []model.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []model.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
+			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
 			expected: 2,
 		},
 		{
 			name:     "Mixed, Same Length",
-			t1:       []model.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []model.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
+			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
 			expected: 1,
 		},
 		{
 			name:     "Empty",
-			t1:       []model.Input{},
-			t2:       []model.Input{{Token: 1}, {Token: 2}, {Token: 3}},
+			t1:       []input.Input{},
+			t2:       []input.Input{{Token: 1}, {Token: 2}, {Token: 3}},
 			expected: 0,
 		},
 		{
 			name:     "Both Empty",
-			t1:       []model.Input{},
-			t2:       []model.Input{},
+			t1:       []input.Input{},
+			t2:       []input.Input{},
 			expected: 0,
 		},
 	}
@@ -82,7 +82,7 @@ func TestFindCacheSlot(t *testing.T) {
 	tests := []struct {
 		name    string
 		cache   InputCache
-		prompt  []model.Input
+		prompt  []input.Input
 		longest expected
 		best    expected
 	}{
@@ -91,18 +91,18 @@ func TestFindCacheSlot(t *testing.T) {
 			cache: InputCache{slots: []InputCacheSlot{
 				{
 					Id:       0,
-					Inputs:   []model.Input{},
+					Inputs:   []input.Input{},
 					InUse:    false,
 					lastUsed: time.Time{},
 				},
 				{
 					Id:       1,
-					Inputs:   []model.Input{},
+					Inputs:   []input.Input{},
 					InUse:    false,
 					lastUsed: time.Time{},
 				},
 			}},
-			prompt:  []model.Input{{Token: 1}},
+			prompt:  []input.Input{{Token: 1}},
 			longest: expected{result: 0, len: 0},
 			best:    expected{result: 0, len: 0},
 		},
@@ -111,18 +111,18 @@ func TestFindCacheSlot(t *testing.T) {
 			cache: InputCache{slots: []InputCacheSlot{
 				{
 					Id:       0,
-					Inputs:   []model.Input{{Token: 1}},
+					Inputs:   []input.Input{{Token: 1}},
 					InUse:    false,
 					lastUsed: time.Now().Add(-time.Second),
 				},
 				{
 					Id:       1,
-					Inputs:   []model.Input{{Token: 1}, {Token: 2}},
+					Inputs:   []input.Input{{Token: 1}, {Token: 2}},
 					InUse:    false,
 					lastUsed: time.Now().Add(-2 * time.Second),
 				},
 			}},
-			prompt:  []model.Input{{Token: 1}, {Token: 2}},
+			prompt:  []input.Input{{Token: 1}, {Token: 2}},
 			longest: expected{result: 1, len: 2},
 			best:    expected{result: 1, len: 2},
 		},
@@ -131,18 +131,18 @@ func TestFindCacheSlot(t *testing.T) {
 			cache: InputCache{slots: []InputCacheSlot{
 				{
 					Id:       0,
-					Inputs:   []model.Input{{Token: 1}, {Token: 2}},
+					Inputs:   []input.Input{{Token: 1}, {Token: 2}},
 					InUse:    false,
 					lastUsed: time.Now().Add(-time.Second),
 				},
 				{
 					Id:       1,
-					Inputs:   []model.Input{},
+					Inputs:   []input.Input{},
 					InUse:    false,
 					lastUsed: time.Time{},
 				},
 			}},
-			prompt:  []model.Input{{Token: 2}},
+			prompt:  []input.Input{{Token: 2}},
 			longest: expected{result: 0, len: 0},
 			best:    expected{result: 1, len: 0},
 		},
@@ -152,19 +152,19 @@ func TestFindCacheSlot(t *testing.T) {
 				slots: []InputCacheSlot{
 					{
 						Id:       0,
-						Inputs:   []model.Input{{Token: 1}, {Token: 2}},
+						Inputs:   []input.Input{{Token: 1}, {Token: 2}},
 						InUse:    false,
 						lastUsed: time.Now().Add(-time.Second),
 					},
 					{
 						Id:       1,
-						Inputs:   []model.Input{},
+						Inputs:   []input.Input{},
 						InUse:    false,
 						lastUsed: time.Time{},
 					},
 				},
 			},
-			prompt:  []model.Input{{Token: 1}},
+			prompt:  []input.Input{{Token: 1}},
 			longest: expected{result: 0, len: 1},
 			best:    expected{result: 1, len: 1},
 		},
@@ -173,18 +173,18 @@ func TestFindCacheSlot(t *testing.T) {
 			cache: InputCache{slots: []InputCacheSlot{
 				{
 					Id:       0,
-					Inputs:   []model.Input{{Token: 1}},
+					Inputs:   []input.Input{{Token: 1}},
 					InUse:    false,
 					lastUsed: time.Now().Add(-time.Second),
 				},
 				{
 					Id:       1,
-					Inputs:   []model.Input{{Token: 1}, {Token: 2}},
+					Inputs:   []input.Input{{Token: 1}, {Token: 2}},
 					InUse:    false,
 					lastUsed: time.Now().Add(-2 * time.Second),
 				},
 			}},
-			prompt:  []model.Input{{Token: 2}, {Token: 3}},
+			prompt:  []input.Input{{Token: 2}, {Token: 3}},
 			longest: expected{result: 0, len: 0},
 			best:    expected{result: 1, len: 0},
 		},
@@ -193,18 +193,18 @@ func TestFindCacheSlot(t *testing.T) {
 			cache: InputCache{slots: []InputCacheSlot{
 				{
 					Id:       0,
-					Inputs:   []model.Input{{Token: 1}, {Token: 2}},
+					Inputs:   []input.Input{{Token: 1}, {Token: 2}},
 					InUse:    true,
 					lastUsed: time.Now().Add(-time.Second),
 				},
 				{
 					Id:       1,
-					Inputs:   []model.Input{{Token: 1}},
+					Inputs:   []input.Input{{Token: 1}},
 					InUse:    false,
 					lastUsed: time.Now().Add(-2 * time.Second),
 				},
 			}},
-			prompt:  []model.Input{{Token: 1}, {Token: 2}},
+			prompt:  []input.Input{{Token: 1}, {Token: 2}},
 			longest: expected{result: 1, len: 1},
 			best:    expected{result: 1, len: 2},
 		},

+ 20 - 36
runner/ollamarunner/runner.go

@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
 	"github.com/ollama/ollama/runner/common"
 	"github.com/ollama/ollama/sample"
 
@@ -41,10 +42,10 @@ type Sequence struct {
 	iBatch int
 
 	// prompt inputs left to evaluate
-	inputs []model.Input
+	inputs []input.Input
 
 	// inputs that have been added to a batch but not yet submitted to Forward
-	pendingInputs []model.Input
+	pendingInputs []input.Input
 
 	// tokens that have been generated but not returned yet (e.g. for stop sequences)
 	pendingResponses []string
@@ -144,8 +145,8 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 // inputs processes the prompt and images into a list of inputs
 // by splitting the prompt on [img-<n>] tags, tokenizing text and
 // decoding images
-func (s *Server) inputs(ctx ml.Context, prompt string, images []ImageData) ([]model.Input, error) {
-	var inputs []model.Input
+func (s *Server) inputs(ctx ml.Context, prompt string, images []ImageData) ([]input.Input, error) {
+	var inputs []input.Input
 	var parts []string
 	var matches [][]string
 
@@ -168,7 +169,7 @@ func (s *Server) inputs(ctx ml.Context, prompt string, images []ImageData) ([]mo
 		}
 
 		for _, t := range tokens {
-			inputs = append(inputs, model.Input{Token: t})
+			inputs = append(inputs, input.Input{Token: t})
 		}
 
 		// image - decode and store
@@ -196,7 +197,7 @@ func (s *Server) inputs(ctx ml.Context, prompt string, images []ImageData) ([]mo
 			_, _ = s.multimodalHash.Write(images[imageIndex].Data)
 			imageHash := s.multimodalHash.Sum64()
 
-			inputs = append(inputs, model.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
+			inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
 			postTokenize = true
 		}
 	}
@@ -250,9 +251,6 @@ type Server struct {
 	// KV cache
 	cache *InputCache
 
-	// next sequence for prompt processing to avoid starvation
-	nextSeq int
-
 	// multimodalHash generates hashes for comparing equality
 	// of non-text data
 	multimodalHash maphash.Hash
@@ -329,29 +327,25 @@ func (s *Server) processBatch() error {
 	}
 	defer s.mu.Unlock()
 
-	var options model.Options
-
-	seqIdx := s.nextSeq - 1
-	for range s.seqs {
-		seqIdx = (seqIdx + 1) % len(s.seqs)
-		seq := s.seqs[seqIdx]
+	var options input.Options
 
+	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
 		}
 
 		// if past the num predict limit
 		if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
-			s.removeSequence(seqIdx, "limit")
+			s.removeSequence(i, "limit")
 			continue
 		}
 
 		if !s.cache.enabled {
 			seq.inputs = append(seq.cache.Inputs, seq.inputs...)
-			seq.cache.Inputs = []model.Input{}
+			seq.cache.Inputs = []input.Input{}
 		}
 
-		for i, input := range seq.inputs {
+		for j, inp := range seq.inputs {
 			if int32(len(seq.cache.Inputs)+len(seq.pendingInputs)+1) > s.cache.numCtx {
 				if len(seq.pendingInputs) == 0 {
 					err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
@@ -363,33 +357,23 @@ func (s *Server) processBatch() error {
 				}
 			}
 
-			if i >= s.batchSize {
-				break
-			}
-
-			// TODO(jessegross): This is a workaround for generating an attention mask and also providing a hint
-			// to the encoder cache.
-			//
-			// Break the batch when switching from text to images so that images are always at the beginning.
-			if input.Multimodal != nil && !(len(seq.pendingInputs) == 0 ||
-				(len(options.Multimodal) > 0 && options.Multimodal[len(options.Multimodal)-1].Index == len(options.Inputs)-1)) {
-				s.nextSeq = seqIdx
+			if j >= s.batchSize {
 				break
 			}
 
-			options.Inputs = append(options.Inputs, input.Token)
-			if input.Multimodal != nil {
-				options.Multimodal = append(options.Multimodal, model.MultimodalIndex{Index: len(options.Inputs) - 1, Multimodal: input.Multimodal})
+			options.Inputs = append(options.Inputs, inp.Token)
+			if inp.Multimodal != nil {
+				options.Multimodal = append(options.Multimodal, input.MultimodalIndex{Index: len(options.Inputs) - 1, Multimodal: inp.Multimodal})
 			}
 
 			options.Positions = append(options.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
 			options.Sequences = append(options.Sequences, seq.cache.Id)
 
 			seq.iBatch = len(options.Outputs)
-			if i+1 == len(seq.inputs) {
+			if j+1 == len(seq.inputs) {
 				options.Outputs = append(options.Outputs, int32(len(options.Inputs)-1))
 			}
-			seq.pendingInputs = append(seq.pendingInputs, input)
+			seq.pendingInputs = append(seq.pendingInputs, inp)
 		}
 
 		seq.inputs = seq.inputs[len(seq.pendingInputs):]
@@ -417,7 +401,7 @@ func (s *Server) processBatch() error {
 		// After calling Forward, pending inputs are now in the cache
 		if len(seq.pendingInputs) > 0 {
 			seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
-			seq.pendingInputs = []model.Input{}
+			seq.pendingInputs = []input.Input{}
 		}
 
 		// don't sample prompt processing
@@ -464,7 +448,7 @@ func (s *Server) processBatch() error {
 			return err
 		}
 
-		seq.inputs = []model.Input{{Token: token}}
+		seq.inputs = []input.Input{{Token: token}}
 
 		seq.pendingResponses = append(seq.pendingResponses, piece)
 		sequence := strings.Join(seq.pendingResponses, "")