memory_test.go 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. package llm
  2. import (
  3. "bytes"
  4. "encoding/binary"
  5. "fmt"
  6. "os"
  7. "testing"
  8. "github.com/ollama/ollama/api"
  9. "github.com/ollama/ollama/envconfig"
  10. "github.com/ollama/ollama/gpu"
  11. "github.com/stretchr/testify/assert"
  12. "github.com/stretchr/testify/require"
  13. )
  14. func TestEstimateGPULayers(t *testing.T) {
  15. envconfig.Debug = true
  16. modelName := "dummy"
  17. f, err := os.CreateTemp(t.TempDir(), modelName)
  18. assert.Nil(t, err)
  19. defer f.Close()
  20. gguf := NewGGUFV3(binary.LittleEndian)
  21. inputLayerCount := 5
  22. tensors := []Tensor{
  23. {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
  24. {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
  25. {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
  26. {Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
  27. {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
  28. {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
  29. }
  30. assert.Equal(t, inputLayerCount+1, len(tensors))
  31. err = gguf.Encode(f, KV{
  32. "general.architecture": "llama",
  33. "general.name": "name",
  34. "llama.context_length": uint32(32),
  35. "llama.embedding_length": uint32(4096),
  36. "llama.block_count": uint32(inputLayerCount),
  37. "llama.attention.head_count": uint32(32),
  38. "llama.attention.head_count_kv": uint32(32),
  39. "tokenizer.ggml.tokens": []string{" "},
  40. "tokenizer.ggml.scores": []float32{0},
  41. "tokenizer.ggml.token_type": []int32{0},
  42. }, tensors)
  43. require.NoError(t, err)
  44. ggml, err := LoadModel(f.Name())
  45. require.NoError(t, err)
  46. // Simple CPU scenario
  47. gpus := []gpu.GpuInfo{
  48. {
  49. Library: "cpu",
  50. },
  51. }
  52. projectors := []string{}
  53. opts := api.DefaultOptions()
  54. estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
  55. assert.Equal(t, 0, estimate.Layers)
  56. assert.Equal(t, uint64(0), estimate.Graph)
  57. // derived from the dummy ggml file above
  58. graphPartialOffload := uint64(202377216)
  59. graphFullOffload := uint64(171968512)
  60. layerSize := uint64(33554436)
  61. projectorSize := uint64(0)
  62. memoryLayerOutput := uint64(4)
  63. // Dual CUDA scenario with assymetry
  64. gpuMinimumMemory := uint64(2048)
  65. gpus = []gpu.GpuInfo{
  66. {
  67. Library: "cuda",
  68. MinimumMemory: gpuMinimumMemory,
  69. },
  70. {
  71. Library: "cuda",
  72. MinimumMemory: gpuMinimumMemory,
  73. },
  74. }
  75. // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
  76. for i, s := range [][]uint64{
  77. {1, 1, 1, 1},
  78. {2, 1, 2, 1},
  79. {2, 2, 2, 2},
  80. {1, 2, 1, 2},
  81. {3, 3, 3, 3},
  82. {4, 4, 3, 3},
  83. {6, 6, 3, 3},
  84. {0, 3, 0, 3},
  85. } {
  86. gpus[0].FreeMemory = 0
  87. gpus[1].FreeMemory = 0
  88. gpus[0].FreeMemory += projectorSize + memoryLayerOutput
  89. gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
  90. gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
  91. gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
  92. gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
  93. estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
  94. assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
  95. assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
  96. var layerSums uint64
  97. for _, b := range estimate.GPUSizes {
  98. layerSums += b
  99. }
  100. if estimate.Layers < inputLayerCount+1 {
  101. assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
  102. assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
  103. } else {
  104. assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
  105. assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
  106. }
  107. }
  108. }