llm.go

package llm

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"runtime"
	"slices"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/gpu"
)
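
// LLM is the interface implemented by loaded model backends: prediction,
// embedding, tokenizer encode/decode, and Close to release resources.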
type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}
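
// cpuOnlyFamilies lists model architectures that don't support gpu inference
// yet; New forces NumGPU to 0 for these.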
var cpuOnlyFamilies = []string{
	"mamba",
}
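
// New loads the GGML model at the given path, clamps the requested context
// length to the model's maximum, estimates the memory needed for the kv cache
// and compute graph, and decides how many layers (if any) to offload to the
// GPU before starting the llm server.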
func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if opts.NumCtx > int(ggml.NumCtx()) {
		slog.Warn(fmt.Sprintf("requested context length is greater than the model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
		opts.NumCtx = int(ggml.NumCtx())
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	vram, _ := gpu.CheckVRAM()
	size := ggml.Size

	// fp16 k and v matrices hold n_ctx * n_layer * n_embd / n_head * n_head_kv
	// elements each, at 2 bytes per element and * 2 for key and value
	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
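	// For example, a hypothetical model with 32 layers, n_embd=4096,
	// n_head=n_head_kv=32 and a 2048-token context needs
	// 2*2*2048*32*4096*32/32 bytes, i.e. about 1 GiB for the cache.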

	// this amount is the overhead + tensors in memory
	// TODO: get this from llama.cpp's graph calculations instead of
	// estimating it as 1/6 * kv_cache_size * num_gqa
	graph := int64(ggml.NumGQA()) * kv / 6
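	// With the hypothetical numbers above (num_gqa = 1) that is roughly
	// 1 GiB / 6, or about 170 MiB.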

	// certain model architectures don't support gpu inference yet
	if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
		opts.NumGPU = 0
	}

	info := gpu.GetGPUInfo()
	switch runtime.GOOS {
	case "darwin":
		if opts.NumGPU == 0 {
			break
		}

		if size+kv+graph > vram {
			slog.Info("not enough vram available, setting num_gpu=0")
			opts.NumGPU = 0
			break
		}

		// TODO: implement layer splitting on macOS
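		// in the meantime, a value larger than the layer count offloads the
		// whole model to the Metal device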
		opts.NumGPU = 999
	default:
		if info.Library == "cpu" {
			slog.Info("GPU not available, falling back to CPU")
			opts.NumGPU = 0
			break
		}

		// don't use GPU at all if no layers are loaded
		if opts.NumGPU == 0 {
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			break
		}

		// user-defined GPU count
		if opts.NumGPU != -1 {
			break
		}

		// the "main" GPU needs the most memory and determines the limit
		// of how many layers can be loaded. It needs to fit:
		// 1. the full compute graph allocation for all devices (graph)
		// 2. the proportional kv cache for all devices (kv * % layers)
		// 3. the proportional model (size * % layers / # devices)
		// This estimates the number of layers
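		// Rearranging avg ≈ graph + (layers/maxlayers)*kv + (layers/maxlayers)*(size/devices)
		// for layers gives the estimate computed below.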
		maxlayers := int64(ggml.NumLayers()) + 1
		devices := int64(info.DeviceCount)
		avg := vram / devices
		layers := maxlayers * (avg - graph) / (kv + size/devices)
		if layers > maxlayers {
			layers = maxlayers
		}

		// 1 + 2 must fit on the main gpu
		min := graph + kv*layers/maxlayers
		if layers <= 0 || min > avg {
			slog.Info("not enough vram available, falling back to CPU only")
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			opts.NumGPU = 0
			break
		}

		opts.NumGPU = int(layers)
	}
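
	// zero values let the runner fall back to the model's own RoPE settings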
	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0

	return newLlmServer(info, model, adapters, projectors, opts)
}

// Give any native cgo implementations an opportunity to initialize
func Init() error {
	return nativeInit()
}
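
// newLlmServer picks a dynamic llm library suited to the detected GPU (or the
// one requested via OLLAMA_LLM_LIBRARY) and starts the external server, trying
// each remaining candidate library if loading fails.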
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	dynLibs := getDynLibs(gpuInfo)

	// Check to see if the user has requested a specific library instead of auto-detecting
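	// e.g. OLLAMA_LLM_LIBRARY=cpu forces the CPU-only library; the set of
	// available names depends on how the binaries were built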
	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
	if demandLib != "" {
		libPath := availableDynLibs[demandLib]
		if libPath == "" {
			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
		} else {
			slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
			dynLibs = []string{libPath}
		}
	}

	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
	_, err := os.Stat(dynLibs[0])
	if err != nil {
		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
		err = nativeInit()
		if err != nil {
			return nil, err
		}
	}

	err2 := fmt.Errorf("unable to locate suitable llm library")
	for _, dynLib := range dynLibs {
		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
		if err == nil {
			return srv, nil
		}
		slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
		err2 = err
	}

	return nil, err2
}