package kvcache

import (
	"fmt"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/input"
)

// EncoderCache stores K and V tensors that are position independent.
//
// The tensors can be of any shape and will be returned as they were stored.
// The mask is currently always nil.
//
// Not currently safe for multiple sequences.
type EncoderCache struct {
	// config controls mostly backend-specific optimizations
	config *ml.CacheConfig

	// ** current forward pass **

	// curLayer is the active layer for Get and Put
	curLayer int

	// curPos is the position that will be used if something is stored
	// during this pass (but there is no guarantee anything will be stored)
	curPos int32

	// ** cache metadata **

	// encoderCached records whether something was stored in the cache
	encoderCached bool

	// encoderPos is the position of the cached data
	encoderPos int32

	// ** cache data storage **
	backend      ml.Backend
	ctxs         map[int]ml.Context
	keys, values map[int]ml.Tensor
}
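
// NewEncoderCache creates an empty encoder cache. Per-layer contexts and
// backing tensors are allocated lazily on first Put.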
func NewEncoderCache() *EncoderCache {
	return &EncoderCache{
		ctxs:   make(map[int]ml.Context),
		keys:   make(map[int]ml.Tensor),
		values: make(map[int]ml.Tensor),
	}
}
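
// Init prepares the cache for use with the given backend. The dtype and
// capacity arguments are currently unused because tensors are allocated
// lazily with the shape and type of whatever is stored.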
func (c *EncoderCache) Init(backend ml.Backend, dtype ml.DType, capacity int32) {
	if c.config == nil {
		var config ml.CacheConfig
		if cc, ok := backend.(ml.BackendCacheConfig); ok {
			config = cc.CacheConfig()
		}
		c.config = &config
	}

	if c.config.CachePadding != 0 && c.config.CachePadding != 1 {
		panic(fmt.Errorf("encoder cache is unable to enforce requested CachePadding (%v)", c.config.CachePadding))
	}

	c.backend = backend
}
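
// SetConfig sets the cache configuration provided by the model. It must be
// called at most once and before Init, which otherwise takes the
// configuration from the backend.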
func (c *EncoderCache) SetConfig(config ml.CacheConfig) {
	if c.config != nil {
		panic("config cannot be changed after being previously set, either by the model or backend")
	}

	c.config = &config
}
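
// Close frees the per-layer contexts, releasing the cached tensors.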
func (c *EncoderCache) Close() {
	for _, ctx := range c.ctxs {
		ctx.Close()
	}
}
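
// StartForward prepares the cache for a new forward pass, recording the
// position of the most recent image in the batch in case its encoder
// output is stored.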
func (c *EncoderCache) StartForward(ctx ml.Context, batch input.Batch) error {
	// We work with the most recent image
	if len(batch.Multimodal) > 0 {
		c.curPos = batch.Positions[batch.Multimodal[len(batch.Multimodal)-1].Index]
	}

	return nil
}
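
// SetLayer selects the layer that subsequent calls to Get and Put operate on.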
func (c *EncoderCache) SetLayer(layer int) {
	c.curLayer = layer
}
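
// EncoderCached reports whether encoder output has been stored since the
// cache was created or last invalidated by Remove.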
func (c *EncoderCache) EncoderCached() bool {
	return c.encoderCached
}
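
// Get returns the cached key and value tensors for the current layer. The
// third return value is the attention mask, which is always nil because the
// cached tensors are position independent.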
func (c *EncoderCache) Get(ctx ml.Context) (ml.Tensor, ml.Tensor, ml.Tensor) {
	return c.keys[c.curLayer], c.values[c.curLayer], nil
}
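
// Put stores the key and value tensors for the current layer, allocating a
// dedicated context and backing tensors on first use and copying the inputs
// into them.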
func (c *EncoderCache) Put(ctx ml.Context, key, value ml.Tensor) {
	c.encoderPos = c.curPos
	c.encoderCached = true

	if c.config.PermutedV {
		value = value.Permute(ctx, 1, 2, 0, 3)
	}

	if _, ok := c.ctxs[c.curLayer]; !ok {
		c.ctxs[c.curLayer] = c.backend.NewContextSize(2).Layer(c.curLayer)
	}

	if _, ok := c.keys[c.curLayer]; !ok {
		c.keys[c.curLayer] = c.ctxs[c.curLayer].Empty(key.DType(), key.Shape()...)
	}

	if _, ok := c.values[c.curLayer]; !ok {
		c.values[c.curLayer] = c.ctxs[c.curLayer].Empty(value.DType(), value.Shape()...)
	}

	ctx.Forward(
		key.Copy(ctx, c.keys[c.curLayer]),
		value.Copy(ctx, c.values[c.curLayer]),
	)
}
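
// CopyPrefix is unsupported: the encoder cache does not track multiple
// sequences.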
func (c *EncoderCache) CopyPrefix(srcSeq, dstSeq int, len int32) {
	panic("encoder cache does not support multiple sequences")
}
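
// Remove invalidates the cached encoder output if its position falls within
// [beginIndex, endIndex). It never returns an error.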
func (c *EncoderCache) Remove(seq int, beginIndex, endIndex int32) error {
	if c.encoderPos >= beginIndex && c.encoderPos < endIndex {
		c.encoderCached = false
	}

	return nil
}
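
// The sketch below shows how a model might drive this cache during a forward
// pass. It is illustrative only: backend, ctx, batch, key, value, and
// ml.DTypeF16 are assumptions about the caller and the ml package, not part
// of this file.
//
//	cache := NewEncoderCache()
//	cache.Init(backend, ml.DTypeF16, 0) // dtype and capacity are currently unused
//	defer cache.Close()
//
//	if err := cache.StartForward(ctx, batch); err != nil {
//		return err
//	}
//
//	cache.SetLayer(0)
//	if !cache.EncoderCached() {
//		cache.Put(ctx, key, value) // key/value computed by the vision encoder
//	}
//	k, v, _ := cache.Get(ctx) // mask is always nil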