memory_test.go

package llm

import (
	"bytes"
	"fmt"
	"os"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/gpu"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)
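
// TestEstimateGPULayers builds a tiny dummy GGUF model on disk and verifies
// that EstimateGPULayers offloads nothing on a CPU-only system and splits
// layers across two CUDA GPUs according to their available free memory.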
func TestEstimateGPULayers(t *testing.T) {
	t.Setenv("OLLAMA_DEBUG", "1")

	modelName := "dummy"
	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()
	inputLayerCount := 5
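
	// One small tensor per block plus the output tensor, giving the
	// estimator inputLayerCount+1 layers to place.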
	tensors := []Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}
	assert.Len(t, tensors, inputLayerCount+1)
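
	// Write just enough llama-architecture metadata for LoadModel to parse
	// the file and derive layer and graph sizes.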
	err = WriteGGUF(f, KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(inputLayerCount),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, tensors)
	require.NoError(t, err)

	ggml, err := LoadModel(f.Name(), 0)
	if err != nil {
		t.Fatal(err)
	}

	// Simple CPU scenario
	gpus := []gpu.GpuInfo{
		{
			Library: "cpu",
		},
	}
	projectors := []string{}
	opts := api.DefaultOptions()
	t.Run("cpu", func(t *testing.T) {
		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
		assert.Equal(t, 0, estimate.Layers)
		assert.Equal(t, uint64(0), estimate.Graph)
	})

	// derived from the dummy ggml file above
	graphPartialOffload := uint64(202377216)
	graphFullOffload := uint64(171968512)
	layerSize := uint64(33554436)
	projectorSize := uint64(0)
	memoryLayerOutput := uint64(4)

	// Dual CUDA scenario with asymmetry
	gpuMinimumMemory := uint64(2048)
	gpus = []gpu.GpuInfo{
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
		{
			Library:       "cuda",
			MinimumMemory: gpuMinimumMemory,
		},
	}

	// Each case gives the layer capacity of GPU0 and GPU1, and the expected
	// number of layers assigned to each
	for i, s := range []struct {
		layer0, layer1   uint64
		expect0, expect1 uint64
	}{
		{1, 1, 1, 1},
		{2, 1, 2, 1},
		{2, 2, 2, 2},
		{1, 2, 1, 2},
		{3, 3, 3, 3},
		{4, 4, 3, 3},
		{6, 6, 3, 3},
		{0, 3, 0, 3},
	} {
		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
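			// Budget each GPU exactly enough free memory for its part of
			// the scenario: minimum driver overhead, capacity for the
			// requested layers plus one spare, the compute graph, and
			// room for the output layer on GPU0 when it takes layers,
			// otherwise on GPU1.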
			gpus[0].FreeMemory = 0
			gpus[1].FreeMemory = 0
			gpus[0].FreeMemory += projectorSize
			if s.layer0 > 0 {
				gpus[0].FreeMemory += memoryLayerOutput
			} else {
				gpus[1].FreeMemory += memoryLayerOutput
			}
			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
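
			// With that budget the estimator should place exactly
			// expect0+expect1 layers, split expect0,expect1 across the GPUs.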
			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
			var layerSums uint64
			for _, b := range estimate.GPUSizes {
				layerSums += b
			}
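
			// Partial offload leaves some layers on the CPU, so VRAM use
			// is strictly below the total; full offload fits the whole model.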
			if estimate.Layers < inputLayerCount+1 {
				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			} else {
				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			}
		})
	}
}