llm.go 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. package llm
import (
	"context"
	"fmt"
	"log/slog"
	"math"
	"os"
	"runtime"
	"time"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
// LLM is the interface to a loaded language model. Implementations are
// created via New and must be released with Close when no longer needed.
type LLM interface {
	// Predict runs generation with the given options, streaming results to
	// the callback once per PredictResult.
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	// Embedding returns the embedding vector for the given prompt text.
	Embedding(context.Context, string) ([]float64, error)
	// Encode tokenizes text into model token IDs.
	Encode(context.Context, string) ([]int, error)
	// Decode converts token IDs back into text.
	Decode(context.Context, []int) (string, error)
	// Close releases the model and any associated resources.
	Close()
}
  19. func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
  20. if _, err := os.Stat(model); err != nil {
  21. return nil, err
  22. }
  23. f, err := os.Open(model)
  24. if err != nil {
  25. return nil, err
  26. }
  27. defer f.Close()
  28. ggml, err := DecodeGGML(f)
  29. if err != nil {
  30. return nil, err
  31. }
  32. if opts.NumCtx > int(ggml.NumCtx()) {
  33. slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
  34. opts.NumCtx = int(ggml.NumCtx())
  35. }
  36. if opts.NumCtx < 4 {
  37. opts.NumCtx = 4
  38. }
  39. vram, _ := gpu.CheckVRAM()
  40. size := ggml.Size
  41. // fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
  42. kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
  43. // this amount is the overhead + tensors in memory
  44. // TODO: get this from the llama.cpp's graph calculations instead of
  45. // estimating it's 1/6 * kv_cache_size * num_gqa
  46. graph := int64(ggml.NumGQA()) * kv / 6
  47. info := gpu.GetGPUInfo()
  48. switch runtime.GOOS {
  49. case "darwin":
  50. if opts.NumGPU == 0 {
  51. break
  52. }
  53. if size+kv+graph > vram {
  54. slog.Info("not enough vram available, falling back to CPU only")
  55. info.Library = "cpu"
  56. info.Variant = gpu.GetCPUVariant()
  57. opts.NumGPU = 0
  58. break
  59. }
  60. // TODO: implement layer splitting on macOS
  61. opts.NumGPU = 999
  62. default:
  63. if info.Library == "cpu" {
  64. slog.Info("GPU not available, falling back to CPU")
  65. opts.NumGPU = 0
  66. break
  67. }
  68. // don't use GPU at all if no layers are loaded
  69. if opts.NumGPU == 0 {
  70. info.Library = "cpu"
  71. info.Variant = gpu.GetCPUVariant()
  72. break
  73. }
  74. // user-defined GPU count
  75. if opts.NumGPU != -1 {
  76. break
  77. }
  78. // the "main" GPU needs the most memory and determines the limit
  79. // of how many layers can be loaded. It needs to fit:
  80. // 1. the full compute graph allocation for all devices (graph)
  81. // 2. the proportional kv cache for all devices (kv * % layers)
  82. // 3. the proportional model (size * % layers / # devices)
  83. // This estimates the number of layers
  84. maxlayers := int64(ggml.NumLayers()) + 1
  85. devices := int64(info.DeviceCount)
  86. avg := vram / devices
  87. layers := maxlayers * (avg - graph) / (kv + size/devices)
  88. if layers > maxlayers {
  89. layers = maxlayers
  90. }
  91. // 1 + 2 must fit on the main gpu
  92. min := graph + kv*layers/maxlayers
  93. if layers <= 0 || min > avg {
  94. slog.Info("not enough vram available, falling back to CPU only")
  95. info.Library = "cpu"
  96. info.Variant = gpu.GetCPUVariant()
  97. opts.NumGPU = 0
  98. break
  99. }
  100. opts.NumGPU = int(layers)
  101. }
  102. opts.RopeFrequencyBase = 0.0
  103. opts.RopeFrequencyScale = 0.0
  104. return newLlmServer(info, workDir, model, adapters, projectors, opts)
  105. }
  106. // Give any native cgo implementations an opportunity to initialize
// Init gives any native cgo implementations an opportunity to initialize,
// staging their support files under workdir. It returns any error from the
// underlying nativeInit.
func Init(workdir string) error {
	return nativeInit(workdir)
}
  110. func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
  111. dynLibs := getDynLibs(gpuInfo)
  112. // Check to see if the user has requested a specific library instead of auto-detecting
  113. demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
  114. if demandLib != "" {
  115. libPath := availableDynLibs[demandLib]
  116. if libPath == "" {
  117. slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
  118. } else {
  119. slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
  120. dynLibs = []string{libPath}
  121. }
  122. }
  123. // We stage into a temp directory, and if we've been idle for a while, it may have been reaped
  124. _, err := os.Stat(dynLibs[0])
  125. if err != nil {
  126. slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
  127. err = nativeInit(workDir)
  128. if err != nil {
  129. return nil, err
  130. }
  131. }
  132. err2 := fmt.Errorf("unable to locate suitable llm library")
  133. for _, dynLib := range dynLibs {
  134. srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
  135. if err == nil {
  136. return srv, nil
  137. }
  138. slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
  139. err2 = err
  140. }
  141. return nil, err2
  142. }
  143. func parseDurationMs(ms float64) time.Duration {
  144. dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
  145. if err != nil {
  146. panic(err)
  147. }
  148. return dur
  149. }