llm.go

package llm

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"slices"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
)

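// LLM is the interface implemented by a loaded model runner: prediction,
// embeddings, tokenization, and shutdown.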
type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}

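// cpuOnlyFamilies lists model architectures that are always run on the CPU,
// regardless of available VRAM.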
var cpuOnlyFamilies = []string{
	"mamba",
}

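// New decodes the GGML model at the given path, clamps the requested context
// length, estimates how many layers fit in available VRAM, and starts a
// runner with the resulting options.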
func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, _, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if opts.NumCtx > int(ggml.KV().ContextLength()) {
		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
		opts.NumCtx = int(ggml.KV().ContextLength())
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

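	// Account for fixed GPU memory use: the minimum reserved by the runtime,
	// any multimodal projectors, the KV cache, and the compute graph.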
	availableMemory, _ := gpu.CheckVRAM()
	info := gpu.GetGPUInfo()

	usedMemory := info.MinimumMemory
	for _, projector := range projectors {
		usedMemory += projectorMemoryRequirements(projector)

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
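	// Illustrative example (assumed hyperparameters, not taken from this file):
	// with n_ctx=2048, n_layer=32, n_embd=4096 and n_head == n_head_kv (the last
	// two terms cancel), kv = 2*2*2048*32*4096 bytes = 1 GiB.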

	// this amount is the overhead + tensors in memory
	// TODO: get this from the llama.cpp's graph calculations instead of
	// estimating it's 1/6 * kv_cache_size * num_gqa
	graph := int64(ggml.KV().GQA()) * kv / 6

	usedMemory += graph

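	// Fall back to CPU when the fixed overhead alone exceeds available VRAM,
	// or when the architecture is CPU-only.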
	if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
		info.Library = "cpu"
	}

	requiredMemory := usedMemory

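	// Greedily offload repeating layers to the GPU while they still fit and the
	// user's NumGPU limit allows; requiredMemory tracks the full-offload cost.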
	var layers int
	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
		requiredMemory += layerMemory

		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
			usedMemory += layerMemory
			layers++
		}
	}

	memOutputLayer := ggml.LayerSize("output.")
	requiredMemory += memOutputLayer

	// only offload output layer if all repeating layers are offloaded
	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
		usedMemory += memOutputLayer
		layers++
	}

	slog.Info(
		"offload to gpu",
		"layers", layers,
		"required", format.HumanBytes2(requiredMemory),
		"used", format.HumanBytes2(usedMemory),
		"available", format.HumanBytes2(availableMemory),
		"kv", format.HumanBytes2(kv),
		"graph", format.HumanBytes2(graph),
	)

	if opts.NumGPU < 0 && info.Library != "cpu" {
		opts.NumGPU = layers
	}

	return newLlmServer(info, model, adapters, projectors, opts)
}

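// projectorMemoryRequirements estimates the memory needed by a multimodal
// projector by summing its tensor sizes, grouped by the first two components
// of each tensor name. It returns 0 if the file cannot be decoded.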
func projectorMemoryRequirements(filename string) int64 {
	file, err := os.Open(filename)
	if err != nil {
		return 0
	}
	defer file.Close()

	ggml, _, err := DecodeGGML(file)
	if err != nil {
		return 0
	}

	prefixes := make(map[string]struct{})
	for _, layer := range ggml.Tensors() {
		parts := strings.Split(layer.Name, ".")
		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
	}

	var ask int64
	for prefix := range prefixes {
		ask += ggml.LayerSize(prefix)
	}

	return ask
}

// Give any native cgo implementations an opportunity to initialize
func Init() error {
	return nativeInit()
}

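// newLlmServer selects dynamic runner libraries for the detected GPU (or the
// one requested via OLLAMA_LLM_LIBRARY) and tries each candidate until one
// loads successfully.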
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
	dynLibs := getDynLibs(gpuInfo)

	// Check to see if the user has requested a specific library instead of auto-detecting
	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
	if demandLib != "" {
		libPath := availableDynLibs[demandLib]
		if libPath == "" {
			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
		} else {
			slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
			dynLibs = []string{libPath}
		}
	}

	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
	_, err := os.Stat(dynLibs[0])
	if err != nil {
		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
		err = nativeInit()
		if err != nil {
			return nil, err
		}
	}

	err2 := fmt.Errorf("unable to locate suitable llm library")

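	// Try each candidate library in priority order; return the first that
	// loads, otherwise surface the last load error.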
	for _, dynLib := range dynLibs {
		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
		if err == nil {
			return srv, nil
		}

		slog.Warn(fmt.Sprintf("Failed to load dynamic library %s %s", dynLib, err))
		err2 = err
	}

	return nil, err2
}