// llm.go
  1. package llm
  2. import (
  3. "context"
  4. "fmt"
  5. "log/slog"
  6. "os"
  7. "runtime"
  8. "github.com/jmorganca/ollama/api"
  9. "github.com/jmorganca/ollama/gpu"
  10. )
// LLM is the interface implemented by a loaded language-model runtime.
// Implementations are created by New and must be released with Close.
type LLM interface {
	// Predict streams generation results for the given options, invoking the
	// callback once per PredictResult until completion or ctx cancellation.
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	// Embedding returns the embedding vector for the given input text.
	Embedding(context.Context, string) ([]float64, error)
	// Encode tokenizes the input text into model token IDs.
	Encode(context.Context, string) ([]int, error)
	// Decode converts token IDs back into text.
	Decode(context.Context, []int) (string, error)
	// Close releases the underlying native resources.
	Close()
}
  18. func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
  19. if _, err := os.Stat(model); err != nil {
  20. return nil, err
  21. }
  22. f, err := os.Open(model)
  23. if err != nil {
  24. return nil, err
  25. }
  26. defer f.Close()
  27. ggml, err := DecodeGGML(f)
  28. if err != nil {
  29. return nil, err
  30. }
  31. if opts.NumCtx > int(ggml.NumCtx()) {
  32. slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
  33. opts.NumCtx = int(ggml.NumCtx())
  34. }
  35. if opts.NumCtx < 4 {
  36. opts.NumCtx = 4
  37. }
  38. vram, _ := gpu.CheckVRAM()
  39. size := ggml.Size
  40. // fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
  41. kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
  42. // this amount is the overhead + tensors in memory
  43. // TODO: get this from the llama.cpp's graph calculations instead of
  44. // estimating it's 1/6 * kv_cache_size * num_gqa
  45. graph := int64(ggml.NumGQA()) * kv / 6
  46. info := gpu.GetGPUInfo()
  47. switch runtime.GOOS {
  48. case "darwin":
  49. if opts.NumGPU == 0 {
  50. break
  51. }
  52. if size+kv+graph > vram {
  53. slog.Info("not enough vram available, falling back to CPU only")
  54. info.Library = "cpu"
  55. info.Variant = gpu.GetCPUVariant()
  56. opts.NumGPU = 0
  57. break
  58. }
  59. // TODO: implement layer splitting on macOS
  60. opts.NumGPU = 999
  61. default:
  62. if info.Library == "cpu" {
  63. slog.Info("GPU not available, falling back to CPU")
  64. opts.NumGPU = 0
  65. break
  66. }
  67. // don't use GPU at all if no layers are loaded
  68. if opts.NumGPU == 0 {
  69. info.Library = "cpu"
  70. info.Variant = gpu.GetCPUVariant()
  71. break
  72. }
  73. // user-defined GPU count
  74. if opts.NumGPU != -1 {
  75. break
  76. }
  77. // the "main" GPU needs the most memory and determines the limit
  78. // of how many layers can be loaded. It needs to fit:
  79. // 1. the full compute graph allocation for all devices (graph)
  80. // 2. the proportional kv cache for all devices (kv * % layers)
  81. // 3. the proportional model (size * % layers / # devices)
  82. // This estimates the number of layers
  83. maxlayers := int64(ggml.NumLayers()) + 1
  84. devices := int64(info.DeviceCount)
  85. avg := vram / devices
  86. layers := maxlayers * (avg - graph) / (kv + size/devices)
  87. if layers > maxlayers {
  88. layers = maxlayers
  89. }
  90. // 1 + 2 must fit on the main gpu
  91. min := graph + kv*layers/maxlayers
  92. if layers <= 0 || min > avg {
  93. slog.Info("not enough vram available, falling back to CPU only")
  94. info.Library = "cpu"
  95. info.Variant = gpu.GetCPUVariant()
  96. opts.NumGPU = 0
  97. break
  98. }
  99. opts.NumGPU = int(layers)
  100. }
  101. opts.RopeFrequencyBase = 0.0
  102. opts.RopeFrequencyScale = 0.0
  103. return newLlmServer(info, workDir, model, adapters, projectors, opts)
  104. }
// Init gives any native cgo implementations an opportunity to initialize,
// staging their payloads under workdir. It delegates to nativeInit and
// returns its error unchanged.
func Init(workdir string) error {
	return nativeInit(workdir)
}
  109. func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
  110. dynLibs := getDynLibs(gpuInfo)
  111. // Check to see if the user has requested a specific library instead of auto-detecting
  112. demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
  113. if demandLib != "" {
  114. libPath := availableDynLibs[demandLib]
  115. if libPath == "" {
  116. slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
  117. } else {
  118. slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
  119. dynLibs = []string{libPath}
  120. }
  121. }
  122. // We stage into a temp directory, and if we've been idle for a while, it may have been reaped
  123. _, err := os.Stat(dynLibs[0])
  124. if err != nil {
  125. slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
  126. err = nativeInit(workDir)
  127. if err != nil {
  128. return nil, err
  129. }
  130. }
  131. err2 := fmt.Errorf("unable to locate suitable llm library")
  132. for _, dynLib := range dynLibs {
  133. srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
  134. if err == nil {
  135. return srv, nil
  136. }
  137. slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
  138. err2 = err
  139. }
  140. return nil, err2
  141. }