memory_test.go

package fileutils

import (
	"bytes"
	"fmt"
	"os"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
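
// TestEstimateGPULayers writes a minimal GGUF model to disk and verifies the
// layer-placement estimate for a CPU-only system and for a pair of CUDA GPUs
// with varying amounts of free memory.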
func TestEstimateGPULayers(t *testing.T) {
	t.Setenv("OLLAMA_DEBUG", "1")

	modelName := "dummy"
	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()

	inputLayerCount := 5
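	// One attention tensor per block plus a single output tensor; the
	// assertion below keeps this list in sync with inputLayerCount.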
	tensors := []Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}
	assert.Len(t, tensors, inputLayerCount+1)
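
	// Minimal llama metadata: just enough for LoadModel to parse the file
	// and for the estimator to size the KV cache and compute graph.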
	err = WriteGGUF(f, KV{
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(inputLayerCount),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, tensors)
	require.NoError(t, err)

	ggml, err := LoadModel(f.Name(), 0)
	if err != nil {
		t.Fatal(err)
	}

	// Simple CPU scenario
	gpus := []discover.GpuInfo{
		{
			Library: "cpu",
		},
	}
	projectors := []string{}
	opts := api.DefaultOptions()
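	// With no real GPU available, expect zero offloaded layers and no graph
	// memory reserved.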
  52. t.Run("cpu", func(t *testing.T) {
  53. estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
  54. assert.Equal(t, 0, estimate.Layers)
  55. assert.Equal(t, uint64(0), estimate.Graph)
  56. })

	// Sizes derived from the dummy GGUF file above: graph allocations for
	// partial and full offload, per-layer weight size, projector size, and
	// output layer size.
	graphPartialOffload := uint64(202377216)
	graphFullOffload := uint64(171968512)
	layerSize := uint64(33554436)
	projectorSize := uint64(0)
	memoryLayerOutput := uint64(4)

	// Dual CUDA scenario with asymmetry
	gpuMinimumMemory := uint64(2048)
	gpus = []discover.GpuInfo{
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
	}

	// Each case gives the layer capacity of GPU0 and GPU1 and the expected
	// layer placement on each; the dummy model has only inputLayerCount+1 = 6
	// layers in total, so a full offload tops out at 3 per GPU.
	for i, s := range []struct {
		layer0, layer1   uint64
		expect0, expect1 uint64
	}{
		{1, 1, 1, 1},
		{2, 1, 2, 1},
		{2, 2, 2, 2},
		{1, 2, 1, 2},
		{3, 3, 3, 3},
		{4, 4, 3, 3},
		{6, 6, 3, 3},
		{0, 3, 0, 3},
	} {
		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
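			// Budget each GPU's free memory from the sizes above: the reserved
			// minimum, (s.layerN+1) layers of weights plus one byte of slack,
			// and the compute graph; the projector is charged to GPU0, and the
			// output layer to GPU0 unless it has no capacity, then GPU1.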
			gpus[0].FreeMemory = 0
			gpus[1].FreeMemory = 0
			gpus[0].FreeMemory += projectorSize
			if s.layer0 > 0 {
				gpus[0].FreeMemory += memoryLayerOutput
			} else {
				gpus[1].FreeMemory += memoryLayerOutput
			}
			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
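			// Sum the per-GPU allocations to cross-check the reported totals.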
			var layerSums uint64
			for _, b := range estimate.GPUSizes {
				layerSums += b
			}
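			// With a partial offload, some of the model stays in system
			// memory, so VRAM use must come in strictly below the total; with
			// a full offload, the two match and the per-GPU sizes account for
			// everything.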
			if estimate.Layers < inputLayerCount+1 {
				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			} else {
				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			}
		})
	}
}