llm.go

package llm

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"runtime"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}
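// New loads the model at the given path, decodes its GGML metadata, estimates
// how much memory the KV cache and compute graph will need, and decides how
// many layers (if any) to offload to the GPU before starting a dynamic
// llama.cpp server. The memory figures below are heuristics, not an exact
// accounting of llama.cpp's allocations.
//
// A typical call might look like this (a sketch; modelPath and the options
// depend on the caller):
//
//	runner, err := New(workDir, modelPath, nil, nil, api.DefaultOptions())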
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if opts.NumCtx > int(ggml.NumCtx()) {
		slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
		opts.NumCtx = int(ggml.NumCtx())
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	vram, _ := gpu.CheckVRAM()
	size := ggml.Size

	// fp16 k,v matrices require n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each, * 2 for key and value
	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())

	// this amount is the overhead + tensors in memory
	// TODO: get this from llama.cpp's graph calculations instead of
	// estimating it as 1/6 * kv_cache_size * num_gqa
	graph := int64(ggml.NumGQA()) * kv / 6

	info := gpu.GetGPUInfo()
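	// On macOS the model either fits entirely in unified memory or falls back
	// to CPU; on other platforms the number of offloaded layers is estimated
	// from the available VRAM unless the user set NumGPU explicitly.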
	switch runtime.GOOS {
	case "darwin":
		if opts.NumGPU == 0 {
			break
		}

		if size+kv+graph > vram {
			slog.Info("not enough vram available, falling back to CPU only")
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			opts.NumGPU = 0
			break
		}

		opts.NumGPU = 1
	default:
		if info.Library == "cpu" {
			slog.Info("GPU not available, falling back to CPU")
			opts.NumGPU = 0
			break
		}

		// don't use GPU at all if no layers are loaded
		if opts.NumGPU == 0 {
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			break
		}

		// user-defined GPU count
		if opts.NumGPU != -1 {
			break
		}

		// the "main" GPU needs the most memory and determines the limit
		// of how many layers can be loaded. It needs to fit:
		// 1. the full compute graph allocation for all devices (graph)
		// 2. the proportional kv cache for all devices (kv * % layers)
		// 3. the proportional model (size * % layers / # devices)
		// This estimates the number of layers
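		// (the +1 on NumLayers() leaves room to offload the non-repeating
		// output layer in addition to the transformer blocks)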
		maxlayers := int64(ggml.NumLayers()) + 1
		devices := int64(info.DeviceCount)
		avg := vram / devices
		layers := maxlayers * (avg - graph) / (kv + size/devices)
		if layers > maxlayers {
			layers = maxlayers
		}

		// 1 + 2 must fit on the main gpu
		min := graph + kv*layers/maxlayers
		if layers <= 0 || min > avg {
			slog.Info("not enough vram available, falling back to CPU only")
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			opts.NumGPU = 0
			break
		}

		opts.NumGPU = int(layers)
	}

	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0

	return newLlmServer(info, model, adapters, projectors, opts)
}
// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
	return nativeInit(workdir)
}
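// newLlmServer picks the dynamic llama.cpp library to load for the detected
// GPU (or the one forced via OLLAMA_LLM_LIBRARY) and starts the external
// server backed by it.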
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	dynLibs := getDynLibs(gpuInfo)

	// Check to see if the user has requested a specific library instead of auto-detecting
	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
	if demandLib != "" {
		libPath := availableDynLibs[demandLib]
		if libPath == "" {
			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
		} else {
			slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
			dynLibs = []string{libPath}
		}
	}
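	// Try each candidate library in order; keep the most recent error so it
	// can be returned if none of them load.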
	err2 := fmt.Errorf("unable to locate suitable llm library")
	for _, dynLib := range dynLibs {
		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
		if err == nil {
			return srv, nil
		}

		slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
		err2 = err
	}

	return nil, err2
}