llm.go

package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
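
// LLM is the interface implemented by every model runner: prediction,
// embeddings, tokenization (Encode/Decode), and shutdown.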
type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}
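
// AvailableShims maps a GPU library name to the dynamically loadable runner
// for it; newLlmServer consults this map before falling back to the default
// CPU server.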
var AvailableShims = map[string]string{}
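
// New loads the GGML model at the given path, estimates its memory
// requirements, decides how many layers to offload to the GPU, and starts the
// matching runner. Illustrative call only (option values are the caller's own):
//
//	runner, err := llm.New(workDir, modelPath, nil, nil, api.DefaultOptions())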
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	fmt.Println("size", ggml.Size)
	fmt.Println("filetype", ggml.FileType())
	fmt.Println("architecture", ggml.ModelFamily())
	fmt.Println("type", ggml.ModelType())
	fmt.Println("name", ggml.Name())
	fmt.Println("embd", ggml.NumEmbed())
	fmt.Println("head", ggml.NumHead())
	fmt.Println("head_kv", ggml.NumHeadKv())
	fmt.Println("gqa", ggml.NumGQA())

	available, _ := gpu.CheckVRAM()

	// For now assume filesize = model size
	// TODO: use actual model size
	requiredModel := ggml.Size

	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())

	// this amount is the overhead + tensors in memory
	// TODO: get this from llama.cpp's graph calculations instead of
	// estimating it as 1/6 * kv_cache_size * num_gqa
	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6

	requiredTotal := requiredModel + requiredKv + requiredAlloc
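
	// Illustrative numbers (assumed, not computed here): a 7B-class model with
	// n_layer=32, n_embd=4096, n_head=n_head_kv=32 (so num_gqa=1), a ~3.8 GiB
	// model file, and n_ctx=2048 gives roughly:
	//   requiredKv    = 2*2*2048*32*4096*32/32 ≈ 1.0 GiB
	//   requiredAlloc = 1 * requiredKv / 6     ≈ 0.17 GiB
	//   requiredTotal ≈ 3.8 + 1.0 + 0.17       ≈ 5 GiB of VRAM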

	log.Println("system memory bytes:", available)
	log.Println("required model bytes:", requiredModel)
	log.Println("required kv bytes:", requiredKv)
	log.Println("required alloc bytes:", requiredAlloc)
	log.Println("required total bytes:", requiredTotal)

	info := gpu.GetGPUInfo()
	library := info.Library

	if opts.NumGPU == -1 {
		// default to offloading all layers
		opts.NumGPU = int(ggml.NumLayers()) + 1
	}

	// decide how many layers to put on the GPU
	if opts.NumGPU > 0 {
		switch runtime.GOOS {
		case "darwin":
			if requiredTotal > available {
				log.Println("not enough vram available, falling back to CPU only")
				opts.NumGPU = 0
			}
		default:
			if library == "cpu" || library == "default" {
				opts.NumGPU = 0
				break
			}

			// no offloading required
			if requiredTotal <= available {
				break
			}

			// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
			if requiredAlloc > available {
				log.Printf("not enough vram available, falling back to CPU only")
				library = "cpu"
				opts.NumGPU = 0
				break
			}

			available -= requiredAlloc

			// fill remaining vram with layers
			log.Println("splitting", available, "of available memory bytes into layers")
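			// each layer is costed as an equal share of the model weights plus
			// an equal share of the kv cache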
			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
			log.Println("bytes per layer:", bytesPerLayer)
			layers := available / bytesPerLayer
			log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
			if layers < int64(opts.NumGPU) {
				opts.NumGPU = int(layers)
			}
		}
	}

	opts.NumGQA = 0
	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0

	return newLlmServer(library, model, adapters, projectors, opts)
}

// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
	return nativeInit(workdir)
}
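
// newLlmServer starts the dynamic runner registered for the requested library
// when one is available, and otherwise falls back to the default (CPU) ext server.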
func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
	if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
		srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts)
		if err == nil {
			return srv, nil
		}
		log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err)
		// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
	}

	return newDefaultExtServer(model, adapters, projectors, opts)
}