memory_test.go

package llm

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"os"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/gpu"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
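
// TestEstimateGPULayers exercises the offload estimator against a tiny
// synthetic GGUF model: first a CPU-only scenario where nothing can be
// offloaded, then dual-GPU scenarios with asymmetric free memory.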
func TestEstimateGPULayers(t *testing.T) {
	envconfig.Debug = true
	modelName := "dummy"
	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()

	// Encode a minimal llama-style model: one attention tensor per block plus
	// an output tensor, giving inputLayerCount+1 offloadable pieces in total.
	gguf := NewGGUFV3(binary.LittleEndian)
	inputLayerCount := 5

	tensors := []Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
	}
	assert.Len(t, tensors, inputLayerCount+1)
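
	// Minimal llama metadata; the embedding length and head counts drive the
	// estimator's KV cache and compute graph sizing.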
	err = gguf.Encode(f, KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(inputLayerCount),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, tensors)
	require.NoError(t, err)

	ggml, err := LoadModel(f.Name())
	require.NoError(t, err)
	// Simple CPU scenario: with no GPU available, nothing is offloaded and no
	// graph memory is reserved.
	gpus := []gpu.GpuInfo{
		{
			Library: "cpu",
		},
	}
	projectors := []string{}
	opts := api.DefaultOptions()
	t.Run("cpu", func(t *testing.T) {
		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
		assert.Equal(t, 0, estimate.Layers)
		assert.Equal(t, uint64(0), estimate.Graph)
	})
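
	// In the scenarios below, each GPU's FreeMemory is rebuilt from the
	// constants that follow as roughly:
	//
	//	free = MinimumMemory + (1 + layerN) * layerSize + max(graph sizes) + 1
	//
	// so GPU N has room for exactly layerN block layers beyond its baseline
	// overhead (plus the output layer and projector on whichever GPU gets them).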
	// Sizes (in bytes) derived from the dummy GGUF file encoded above
	graphPartialOffload := uint64(202377216)
	graphFullOffload := uint64(171968512)
	layerSize := uint64(33554436)
	projectorSize := uint64(0)
	memoryLayerOutput := uint64(4)
	// Dual CUDA scenario with asymmetry
	gpuMinimumMemory := uint64(2048)
	gpus = []gpu.GpuInfo{
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
	}
	// Each case lists GPU0 layer space, GPU1 layer space, expected layers on
	// gpu0, and expected layers on gpu1.
	for i, s := range []struct {
		layer0, layer1   uint64
		expect0, expect1 uint64
	}{
		{1, 1, 1, 1},
		{2, 1, 2, 1},
		{2, 2, 2, 2},
		{1, 2, 1, 2},
		{3, 3, 3, 3},
		{4, 4, 3, 3},
		{6, 6, 3, 3},
		{0, 3, 0, 3},
	} {
		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
			// Synthesize just enough free memory on each GPU for its share of
			// layers; the output layer's budget goes to GPU0 unless it holds
			// no layers at all.
			gpus[0].FreeMemory = 0
			gpus[1].FreeMemory = 0
			gpus[0].FreeMemory += projectorSize
			if s.layer0 > 0 {
				gpus[0].FreeMemory += memoryLayerOutput
			} else {
				gpus[1].FreeMemory += memoryLayerOutput
			}
			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)

			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)

			// The per-GPU allocations must account for all offloaded memory:
			// VRAMSize when the model is split with the CPU, TotalSize when it
			// fits on the GPUs entirely.
			var layerSums uint64
			for _, b := range estimate.GPUSizes {
				layerSums += b
			}
			if estimate.Layers < inputLayerCount+1 {
				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			} else {
				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			}
		})
	}
}
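
// The sketch below is illustrative, not part of the original test: it shows
// one way a caller might consume the estimate fields asserted above. It
// assumes EstimateGPULayers returns a MemoryEstimate value, as used in this
// package; the helper name fitsEntirely is hypothetical.
func fitsEntirely(estimate MemoryEstimate) bool {
	// As the assertions above establish, the estimator reports
	// VRAMSize == TotalSize exactly when every layer, including the output
	// layer, fits on the GPUs.
	return estimate.VRAMSize == estimate.TotalSize
}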