cache.go 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. package cache
  2. import (
  3. "errors"
  4. "fmt"
  5. "log/slog"
  6. "math"
  7. "slices"
  8. "github.com/ollama/ollama/ml"
  9. )
// ErrNotSupported is returned for cache operations that a model's cache
// implementation does not support.
var ErrNotSupported = errors.New("model does not support operation")

// Cache is the interface implemented by key/value caches used during a
// model forward pass.
type Cache interface {
	// used by model implementations

	// Sub selects layer i as the target of subsequent Put calls and
	// returns the cache.
	Sub(i int) Cache

	// Put stores key/value tensors for the current layer and returns the
	// cached keys, cached values, and the attention mask for the batch.
	Put(ctx ml.Context, key, value ml.Tensor) (ml.Tensor, ml.Tensor, ml.Tensor)

	// cache management

	// Close releases resources held by the cache.
	Close()

	// StartForward prepares the cache for a forward pass; seqs holds one
	// sequence ID per token in the batch.
	StartForward(ctx ml.Context, seqs []int) error

	// CopyPrefix makes dstSeq a copy of the first len positions of srcSeq.
	CopyPrefix(srcSeq, dstSeq int, len int)

	// Remove deletes positions [beginIndex, endIndex) from seq.
	Remove(seq int, beginIndex, endIndex int) error
}
// Causal is a key/value cache for causal attention. Each cache cell may
// be shared by multiple sequences at (possibly different) positions.
type Causal struct {
	DType    ml.DType
	Capacity int

	// current forward pass
	curLayer     int       // layer selected via Sub
	curPos       int       // first cache cell occupied by the current batch
	curBatchSize int       // number of tokens in the current batch
	curMask      ml.Tensor // attention mask built by StartForward
	curCellRange cellRange // cells spanned by the batch's sequences

	// metadata
	cells      []cacheCell       // one entry per cache slot
	seqNextPos map[int]int       // next token position per sequence
	cellRanges map[int]cellRange // min/max cell index in use per sequence

	// cache data storage
	backend  ml.Backend
	cacheCtx ml.Context // owns the per-layer cache tensors below
	keys, values []ml.Tensor // one tensor per layer, allocated lazily in Put
}
  39. type seqCell struct {
  40. seq int
  41. pos int
  42. }
  43. type cacheCell struct {
  44. sequences []seqCell
  45. }
  46. type cellRange struct {
  47. min int
  48. max int
  49. }
  50. func (cell cacheCell) findSeq(seq int) *seqCell {
  51. for i := range cell.sequences {
  52. if cell.sequences[i].seq == seq {
  53. return &cell.sequences[i]
  54. }
  55. }
  56. return nil
  57. }
  58. func NewCausalCache(backend ml.Backend, capacity int, dtype ml.DType) Cache {
  59. return &Causal{
  60. Capacity: capacity,
  61. DType: dtype,
  62. cells: make([]cacheCell, capacity),
  63. seqNextPos: make(map[int]int),
  64. cellRanges: make(map[int]cellRange),
  65. backend: backend,
  66. // TODO(jessegross): This context is not sized appropriately
  67. cacheCtx: backend.NewContext(),
  68. }
  69. }
// Close releases the backend context that owns the cache tensors.
func (c *Causal) Close() {
	c.cacheCtx.Close()
}

// ErrKvCacheFull is returned when no contiguous run of free cells large
// enough for the current batch exists.
var ErrKvCacheFull = errors.New("could not find a kv cache slot")
  74. func (c *Causal) StartForward(ctx ml.Context, seqs []int) error {
  75. c.curBatchSize = len(seqs)
  76. var err error
  77. c.curPos, err = c.findStartPos()
  78. if errors.Is(err, ErrKvCacheFull) {
  79. c.defrag()
  80. c.curPos, err = c.findStartPos()
  81. }
  82. if err != nil {
  83. return err
  84. }
  85. // TODO(jessegross): There should be a better way to do this
  86. origSeq := make(map[int]int)
  87. for k, v := range c.seqNextPos {
  88. origSeq[k] = v
  89. }
  90. c.curCellRange = newRange()
  91. for i, seq := range seqs {
  92. c.cells[c.curPos+i] = cacheCell{sequences: []seqCell{{seq: seq, pos: c.seqNextPos[seq]}}}
  93. c.seqNextPos[seq]++
  94. ranges := c.cellRanges[seq]
  95. if c.curPos+i > ranges.max {
  96. ranges.max = c.curPos + i
  97. }
  98. if ranges.max > c.curCellRange.max {
  99. c.curCellRange.max = ranges.max
  100. }
  101. if c.curPos+i < ranges.min {
  102. ranges.min = c.curPos + i
  103. }
  104. if ranges.min < c.curCellRange.min {
  105. c.curCellRange.min = ranges.min
  106. }
  107. c.cellRanges[seq] = ranges
  108. }
  109. c.curMask, err = c.buildMask(ctx, origSeq, seqs)
  110. return err
  111. }
  112. func newRange() cellRange {
  113. return cellRange{
  114. min: math.MaxInt,
  115. max: 0,
  116. }
  117. }
  118. func (c *Causal) findStartPos() (int, error) {
  119. var start, count int
  120. for i := range c.cells {
  121. if len(c.cells[i].sequences) == 0 {
  122. count++
  123. if count >= c.curBatchSize {
  124. return start, nil
  125. }
  126. } else {
  127. start = i + 1
  128. count = 0
  129. }
  130. }
  131. return 0, fmt.Errorf("%w (length: %v)", ErrKvCacheFull, c.Capacity)
  132. }
  133. func (c *Causal) buildMask(ctx ml.Context, origSeq map[int]int, seqs []int) (ml.Tensor, error) {
  134. // TODO(jessegross): This makes a number of simplifications such as no padding
  135. len := c.curCellRange.max - c.curCellRange.min
  136. mask := make([]float32, c.curBatchSize*len)
  137. for i := range c.curBatchSize {
  138. for j := c.curCellRange.min; j < c.curCellRange.max; j++ {
  139. cellSeq := c.cells[j].findSeq(seqs[i])
  140. if cellSeq == nil || cellSeq.pos > origSeq[seqs[i]]+i {
  141. mask[i*len+(j-c.curCellRange.min)] = float32(math.Inf(-1))
  142. }
  143. }
  144. }
  145. return ctx.FromFloatSlice(mask, len, c.curBatchSize)
  146. }
  147. func moveCell(ctx ml.Context, objs []ml.Tensor, src, dst, len int) {
  148. for _, obj := range objs {
  149. srcView := obj.View(ctx, int(obj.Stride(2))*src, int(obj.Dim(0)*obj.Dim(1))*len)
  150. dstView := obj.View(ctx, int(obj.Stride(2))*dst, int(obj.Dim(0)*obj.Dim(1))*len)
  151. ctx.Forward(srcView.Copy(ctx, dstView))
  152. }
  153. }
// defrag compacts the cache in place: active cells are moved down into
// free holes near the start, leaving free space contiguous at the end.
// Sequence cell ranges are recomputed afterwards.
func (c *Causal) defrag() {
	slog.Debug("defragmenting kv cache")

	// Defrag strategy:
	// - Search for empty holes at the beginning of the cache,
	// filling them with active data starting at the end
	// - If there are contiguous elements that need to be moved,
	// combine them into a single operation by holding new moves
	// until we see the next one is non-contiguous
	// - Fill up the context with the maximum number of operations it
	// can hold then compute that and continue with a new context

	// TODO(jessegross):
	// - Need to size the context and compute maxMoves correctly
	// - Just compacts, doesn't optimize placement

	// NOTE(review): divides by len(c.keys) — this panics if defrag runs
	// before any layer tensor slot exists (Sub never called). Confirm
	// callers guarantee a non-empty keys slice.
	maxMoves := 8192 / (6 * len(c.keys))

	ctx := c.backend.NewContext()
	moves := 0

	// Pending batched move: a run of pendingLen cells starting at
	// pendingSrc to be copied to pendingDst, flushed lazily.
	var pendingSrc, pendingDst, pendingLen int

	for dst := range c.cells {
		if len(c.cells[dst].sequences) == 0 {
			// dst is a hole: pull the highest occupied cell down into it.
			for src := len(c.cells) - 1; src > dst; src-- {
				if len(c.cells[src].sequences) != 0 {
					c.cells[dst] = c.cells[src]
					c.cells[src] = cacheCell{}

					if pendingLen > 0 {
						// Merge with the pending move when contiguous
						// (src descending, dst ascending); otherwise flush.
						// NOTE(review): src descends while dst ascends, so
						// a merged block copy preserves ascending data
						// order while the cell metadata above was assigned
						// in descending src order — verify the merged
						// moveCell matches the metadata assignment.
						if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
							pendingSrc = src
							pendingLen++
							break
						} else {
							moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
							moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
							moves++
						}
					}

					pendingSrc = src
					pendingDst = dst
					pendingLen = 1

					break
				}
			}
		}

		// Context is full: compute what has been queued and start fresh.
		if moves >= maxMoves {
			ctx.Compute(nil)
			ctx.Close()
			ctx = c.backend.NewContext()

			moves = 0
		}
	}

	// Flush any remaining pending move.
	if pendingLen > 0 {
		moveCell(ctx, c.keys, pendingSrc, pendingDst, pendingLen)
		moveCell(ctx, c.values, pendingSrc, pendingDst, pendingLen)
		moves++
	}

	if moves > 0 {
		ctx.Compute(nil)
	}
	ctx.Close()

	// Cells have moved, so rebuild every sequence's cell range from the
	// updated metadata.
	for seq := range c.cellRanges {
		seqRange := newRange()

		for i, cell := range c.cells {
			if cell.findSeq(seq) != nil {
				if i < seqRange.min {
					seqRange.min = i
				}
				if i > seqRange.max {
					seqRange.max = i
				}
			}
		}

		c.cellRanges[seq] = seqRange
	}
}
  226. func (c *Causal) Sub(i int) Cache {
  227. if i >= len(c.keys) {
  228. c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
  229. c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
  230. }
  231. c.curLayer = i
  232. return c
  233. }
// Put copies this batch's key/value tensors into the current layer's
// cache starting at curPos, then returns views over the cached keys and
// values spanning the cells in use, plus the mask built by StartForward.
//
// Assumes key/value are laid out with the batch on dim 2 (the batch
// size check below reads key.Dim(2)) — confirm against the backend's
// tensor layout.
func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) (ml.Tensor, ml.Tensor, ml.Tensor) {
	if c.curBatchSize != int(key.Dim(2)) {
		panic(fmt.Errorf("inconsistent batch sizes (layer: %v, batch size: %v layer batch size: %v)", c.curLayer, c.curBatchSize, int(key.Dim(2))))
	}

	// Lazily allocate this layer's cache tensors on first use.
	if c.keys[c.curLayer] == nil || c.values[c.curLayer] == nil {
		c.keys[c.curLayer] = c.cacheCtx.Zeros(c.DType, key.Dim(0), key.Dim(1), int64(c.Capacity))
		c.values[c.curLayer] = c.cacheCtx.Zeros(c.DType, value.Dim(0), value.Dim(1), int64(c.Capacity))
	}

	// Write the batch into the cache at cell curPos.
	ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, int(key.Stride(2))*c.curPos, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
	ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, int(value.Stride(2))*c.curPos, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))

	// NOTE(review): `len` shadows the builtin, and with curCellRange.max
	// stored as an inclusive index max-min appears to exclude the last
	// cell — verify against buildMask, which uses the same expression.
	len := c.curCellRange.max - c.curCellRange.min

	key = c.keys[c.curLayer].View(ctx, int(key.Stride(2))*c.curCellRange.min,
		int(key.Dim(0)), int(key.Stride(1)),
		int(key.Dim(1)), int(key.Stride(2)),
		len,
	)

	// NOTE(review): this view's offset and outer stride use
	// key.Stride(2) (of the key view created just above) rather than
	// value.Stride(2) — confirm this is intended.
	value = c.values[c.curLayer].View(ctx, int(key.Stride(2))*c.curCellRange.min,
		int(value.Dim(0)), int(value.Stride(1)),
		int(value.Dim(1)), int(value.Stride(2)),
		len,
	)

	return key, value, c.curMask
}
  257. func (c *Causal) CopyPrefix(srcSeq, dstSeq int, len int) {
  258. seqRange := newRange()
  259. for i := range c.cells {
  260. srcCellSeq := c.cells[i].findSeq(srcSeq)
  261. dstCellSeq := c.cells[i].findSeq(dstSeq)
  262. if dstCellSeq != nil {
  263. c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s seqCell) bool { return s.seq == dstSeq })
  264. }
  265. if srcCellSeq != nil && srcCellSeq.pos < len {
  266. c.cells[i].sequences = append(c.cells[i].sequences, seqCell{seq: dstSeq, pos: srcCellSeq.pos})
  267. if i < seqRange.min {
  268. seqRange.min = i
  269. }
  270. if i > seqRange.max {
  271. seqRange.max = i
  272. }
  273. }
  274. }
  275. c.cellRanges[dstSeq] = seqRange
  276. c.seqNextPos[dstSeq] = len
  277. }
// shift is a placeholder for re-positioning seq's cached entries in
// [beginIndex, endIndex) by offset; it currently always panics.
func (c *Causal) shift(seq int, beginIndex, endIndex, offset int) error {
	panic("Shift not yet implemented")
}
// Remove deletes positions [beginIndex, endIndex) from seq and slides
// later positions down by the removed count. endIndex is clamped to the
// sequence's current length.
//
// NOTE(review): mid-sequence removal requires shift, which currently
// panics, so only removals reaching the end of the sequence complete.
func (c *Causal) Remove(seq int, beginIndex, endIndex int) error {
	endIndex = min(endIndex, c.seqNextPos[seq])
	// Negative: the amount by which trailing positions slide down.
	offset := beginIndex - endIndex

	seqRange := newRange()

	for i := range c.cells {
		cellSeq := c.cells[i].findSeq(seq)
		if cellSeq != nil {
			if cellSeq.pos >= beginIndex && cellSeq.pos < endIndex {
				// Inside the removed span: drop seq from this cell.
				c.cells[i].sequences = slices.DeleteFunc(c.cells[i].sequences, func(s seqCell) bool { return s.seq == seq })
			} else {
				// Keep the cell; renumber positions after the span via the
				// pointer returned by findSeq (mutates in place).
				if cellSeq.pos >= endIndex {
					cellSeq.pos += offset
				}

				if i < seqRange.min {
					seqRange.min = i
				}
				if i > seqRange.max {
					seqRange.max = i
				}
			}
		}
	}

	// Trailing entries survive with new positions, so their cached data
	// must be shifted to match.
	if endIndex != c.seqNextPos[seq] {
		err := c.shift(seq, endIndex, c.seqNextPos[seq], offset)
		if err != nil {
			return err
		}
	}

	c.cellRanges[seq] = seqRange
	c.seqNextPos[seq] += offset

	return nil
}