llm.go

package llm

import (
    "context"
    "fmt"
    "log/slog"
    "os"
    "runtime"
    "slices"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/gpu"
)

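// LLM is the interface a loaded model backend exposes to the rest of the
// server: streaming prediction, embeddings, tokenization (Encode/Decode),
// and shutdown.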
type LLM interface {
    Predict(context.Context, PredictOpts, func(PredictResult)) error
    Embedding(context.Context, string) ([]float64, error)
    Encode(context.Context, string) ([]int, error)
    Decode(context.Context, []int) (string, error)
    Close()
}

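// cpuOnlyFamilies lists model architectures that are always run on the CPU
// because GPU offload is not supported for them here.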
var cpuOnlyFamilies = []string{
    "mamba",
}

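// New decodes the GGML model at the given path, estimates its memory
// requirements, decides how many layers (if any) to offload to the GPU,
// and starts a runner for it.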
func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
    if _, err := os.Stat(model); err != nil {
        return nil, err
    }

    f, err := os.Open(model)
    if err != nil {
        return nil, err
    }
    defer f.Close()

    ggml, size, err := DecodeGGML(f)
    if err != nil {
        return nil, err
    }

    if opts.NumCtx > int(ggml.KV().ContextLength()) {
        slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
        opts.NumCtx = int(ggml.KV().ContextLength())
    }

    if opts.NumCtx < 4 {
        opts.NumCtx = 4
    }

    vram, _ := gpu.CheckVRAM()

    // the fp16 k,v cache needs n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 (key and value)
    kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
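    // Illustrative example (assumed numbers, not from this code): a 7B
    // Llama-style model with n_ctx=2048, n_layer=32, n_embd=4096 and
    // n_head=n_head_kv=32 needs 4*2048*32*4096*32/32 bytes = 1 GiB of KV cache.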

    // this amount is the overhead + tensors in memory
    // TODO: get this from llama.cpp's graph calculations instead of
    // estimating it as 1/6 * kv_cache_size * num_gqa
    graph := int64(ggml.KV().GQA()) * kv / 6
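    // Continuing the example above, GQA = n_head/n_head_kv = 1, so the graph
    // estimate comes out to roughly kv/6 ≈ 170 MiB (again illustrative only).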

    if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
        opts.NumGPU = 0
    }

    info := gpu.GetGPUInfo()
    switch runtime.GOOS {
    case "darwin":
        if opts.NumGPU == 0 {
            break
        }

        if size+kv+graph > vram {
            slog.Info("not enough vram available, setting num_gpu=0")
            opts.NumGPU = 0
            break
        }

        // TODO: implement layer splitting on macOS
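        // 999 effectively requests every layer; the backend is assumed to cap
        // it at the model's actual layer count.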
        opts.NumGPU = 999
    default:
        if info.Library == "cpu" {
            slog.Info("GPU not available, falling back to CPU")
            opts.NumGPU = 0
            break
        }

        // don't use GPU at all if no layers are loaded
        if opts.NumGPU == 0 {
            info.Library = "cpu"
            info.Variant = gpu.GetCPUVariant()
            break
        }

        // user-defined GPU count
        if opts.NumGPU != -1 {
            break
        }

        // the "main" GPU needs the most memory and determines the limit
        // of how many layers can be loaded. It needs to fit:
        // 1. the full compute graph allocation for all devices (graph)
        // 2. the proportional kv cache for all devices (kv * % layers)
        // 3. the proportional model (size * % layers / # devices)
        // This estimates the number of layers
        maxlayers := int64(ggml.KV().BlockCount()) + 1
        devices := int64(info.DeviceCount)
        avg := vram / devices
        layers := maxlayers * (avg - graph) / (kv + size/devices)
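        // Illustrative example (assumed numbers): one device with 4 GiB of
        // free VRAM, a ~3.8 GiB model, kv = 1 GiB and graph ≈ 0.17 GiB gives
        // layers ≈ 33 * (4 - 0.17) / (1 + 3.8) ≈ 26 of 33 layers.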
        if layers > maxlayers {
            layers = maxlayers
        }

        // 1 + 2 must fit on the main gpu
        min := graph + kv*layers/maxlayers
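        // Continuing the example: min ≈ 0.17 + 1*26/33 ≈ 0.96 GiB, which fits
        // in the 4 GiB on the main GPU, so 26 layers would be offloaded.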
        if layers <= 0 || min > avg {
            slog.Info("not enough vram available, falling back to CPU only")
            info.Library = "cpu"
            info.Variant = gpu.GetCPUVariant()
            opts.NumGPU = 0
            break
        }

        opts.NumGPU = int(layers)
    }

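    // Zeroing the RoPE parameters leaves it to the backend to use the values
    // stored in the model itself (assumed behaviour of the runner).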
    opts.RopeFrequencyBase = 0.0
    opts.RopeFrequencyScale = 0.0
    return newLlmServer(info, model, adapters, projectors, opts)
}

// Give any native cgo implementations an opportunity to initialize
func Init() error {
    return nativeInit()
}

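// newLlmServer picks the dynamic llama.cpp libraries that match the detected
// GPU, honors an OLLAMA_LLM_LIBRARY override if set, and tries each candidate
// until one loads successfully.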
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
    dynLibs := getDynLibs(gpuInfo)

    // Check to see if the user has requested a specific library instead of auto-detecting
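    // e.g. OLLAMA_LLM_LIBRARY=cpu_avx2 forces the AVX2 CPU build; valid names
    // depend on the platform and build (example value assumed).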
    demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
    if demandLib != "" {
        libPath := availableDynLibs[demandLib]
        if libPath == "" {
            slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
        } else {
            slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
            dynLibs = []string{libPath}
        }
    }

    // We stage the libraries into a temp directory; if the server has been
    // idle for a while, that directory may have been reaped.
    _, err := os.Stat(dynLibs[0])
    if err != nil {
        slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
        err = nativeInit()
        if err != nil {
            return nil, err
        }
    }

    err2 := fmt.Errorf("unable to locate suitable llm library")

    for _, dynLib := range dynLibs {
        srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
        if err == nil {
            return srv, nil
        }
        slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
        err2 = err
    }

    return nil, err2
}
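
// Typical call sequence from outside this package (sketch only; the model
// path and api.DefaultOptions are assumptions, not defined in this file):
//
//    if err := llm.Init(); err != nil {
//        return err
//    }
//    runner, err := llm.New("/path/to/model.gguf", nil, nil, api.DefaultOptions())
//    if err != nil {
//        return err
//    }
//    defer runner.Close()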