memory_test.go

package llm

import (
    "bytes"
    "fmt"
    "os"
    "testing"

    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/fs/ggml"
)
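
// TestEstimateGPULayers writes a tiny llama-architecture GGUF model to disk,
// loads it back, and checks the offload estimates for a CPU-only setup and
// for a dual-GPU setup with asymmetric free memory.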
func TestEstimateGPULayers(t *testing.T) {
    t.Setenv("OLLAMA_DEBUG", "1")
    t.Setenv("OLLAMA_KV_CACHE_TYPE", "") // Ensure default f16
    t.Setenv("OLLAMA_CONTEXT_LENGTH", "2048")

    modelName := "dummy"
    f, err := os.CreateTemp(t.TempDir(), modelName)
    require.NoError(t, err)
    defer f.Close()
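
    // One tensor per repeating block, plus a final output tensor.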
    inputLayerCount := 5
    tensors := []ggml.Tensor{
        {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
        {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
        {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
        {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
        {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
        {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
    }
    assert.Len(t, tensors, inputLayerCount+1)
    err = ggml.WriteGGUF(f, ggml.KV{
        "general.architecture":          "llama",
        "llama.context_length":          uint32(32),
        "llama.embedding_length":        uint32(4096),
        "llama.block_count":             uint32(inputLayerCount),
        "llama.attention.head_count":    uint32(32),
        "llama.attention.head_count_kv": uint32(32),
        "tokenizer.ggml.tokens":         []string{" "},
        "tokenizer.ggml.scores":         []float32{0},
        "tokenizer.ggml.token_type":     []int32{0},
    }, tensors)
    require.NoError(t, err)
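
    // Load the model back through the regular loader so the estimator works
    // from real GGUF metadata.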
    ggml, err := LoadModel(f.Name(), 0)
    if err != nil {
        t.Fatal(err)
    }

    // Simple CPU scenario
    gpus := []discover.GpuInfo{
        {
            Library: "cpu",
        },
    }
    projectors := []string{}
    opts := api.DefaultOptions()
    t.Run("cpu", func(t *testing.T) {
        estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
        assert.Equal(t, 0, estimate.Layers)
        assert.Equal(t, uint64(0), estimate.Graph)
    })

    // derived from the dummy ggml file above
    graphPartialOffload := uint64(202377216)
    graphFullOffload := uint64(171968512)
    layerSize := uint64(33554436)
    projectorSize := uint64(0)
    memoryLayerOutput := uint64(4)
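
    // These sizes are combined below to construct each GPU's free-memory
    // budget so that exactly the expected number of layers fits.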

    // Dual CUDA scenario with asymmetry
    gpuMinimumMemory := uint64(2048)
    gpus = []discover.GpuInfo{
        {
            Library:       "cuda",
            MinimumMemory: gpuMinimumMemory,
        },
        {
            Library:       "cuda",
            MinimumMemory: gpuMinimumMemory,
        },
    }

    // Each case gives the layer capacity of GPU0 and GPU1, then the layer
    // count expected to land on each.
    for i, s := range []struct {
        layer0, layer1   uint64
        expect0, expect1 uint64
    }{
        {1, 1, 1, 1},
        {2, 1, 2, 1},
        {2, 2, 2, 2},
        {1, 2, 1, 2},
        {3, 3, 3, 3},
        {4, 4, 3, 3},
        {6, 6, 3, 3},
        {0, 3, 0, 3},
    } {
        t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
            gpus[0].FreeMemory = 0
            gpus[1].FreeMemory = 0
            gpus[0].FreeMemory += projectorSize
            if s.layer0 > 0 {
                gpus[0].FreeMemory += memoryLayerOutput
            } else {
                gpus[1].FreeMemory += memoryLayerOutput
            }
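
            // Give each GPU enough free memory for the driver reserve, the
            // compute graph, and s.layerN repeating layers, with one extra
            // layer of headroom plus a byte of slack.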
            gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
            gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
            gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
            gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)

            estimate := EstimateGPULayers(gpus, ggml, projectors, opts, 1)
            assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
            assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
            var layerSums uint64
            for _, b := range estimate.GPUSizes {
                layerSums += b
            }
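
            // Partial offload: VRAM use must stay below the model total and
            // match the per-GPU sizes; full offload: all three agree.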
            if estimate.Layers < inputLayerCount+1 {
                assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
                assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
            } else {
                assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
                assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
            }
        })
    }
}
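
// To run only this test (assuming the package directory is llm/):
//   go test ./llm -run TestEstimateGPULayers -v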