llm.go

package llm

import (
    "context"
    "fmt"
    "log"
    "os"
    "runtime"

    "github.com/jmorganca/ollama/api"
    "github.com/jmorganca/ollama/gpu"
)

// LLM is the interface implemented by every model runner: token prediction,
// embeddings, and tokenizer encode/decode.
type LLM interface {
    Predict(context.Context, PredictOpts, func(PredictResult)) error
    Embedding(context.Context, string) ([]float64, error)
    Encode(context.Context, string) ([]int, error)
    Decode(context.Context, []int) (string, error)
    Close()
}

// New decodes the GGML model at the given path, estimates how much of it can
// fit in VRAM, and starts a matching llm server.
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
    if _, err := os.Stat(model); err != nil {
        return nil, err
    }

    f, err := os.Open(model)
    if err != nil {
        return nil, err
    }
    defer f.Close()

    ggml, err := DecodeGGML(f)
    if err != nil {
        return nil, err
    }

    if opts.NumCtx < 4 {
        opts.NumCtx = 4
    }

    vram, _ := gpu.CheckVRAM()
    size := ggml.Size

    // fp16 k,v cache: n_ctx * n_layer * (n_embd / n_head) * n_head_kv * 2 bytes each, * 2 for keys and values
    kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())

    // this amount is the overhead + tensors in memory
    // TODO: get this from llama.cpp's graph calculations instead of
    // estimating it as 1/6 * kv_cache_size * num_gqa
    graph := int64(ggml.NumGQA()) * kv / 6
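    // Worked example (illustrative only; assumes a llama-2-7B-like shape with
    // n_ctx=2048, n_layer=32, n_embd=4096, n_head=n_head_kv=32):
    //   kv    = 2*2*2048*32*(4096/32)*32 bytes = 1 GiB
    //   graph = num_gqa * kv / 6 = 1 GiB / 6 ≈ 0.17 GiB
    // so full offload needs roughly the model size plus ~1.2 GiB at this
    // context length.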

    info := gpu.GetGPUInfo()
    library := info.Library
    switch runtime.GOOS {
    case "darwin":
        if opts.NumGPU == 0 {
            break
        }

        if size+kv+graph > vram {
            log.Println("not enough vram available, falling back to CPU only")
            opts.NumGPU = 0
            break
        }

        opts.NumGPU = 1
    default:
        if library == "cpu" || library == "default" {
            log.Println("GPU not available, falling back to CPU")
            opts.NumGPU = 0
            break
        }

        // don't use GPU at all if no layers are loaded
        if opts.NumGPU == 0 {
            library = "cpu"
            break
        }

        // user-defined GPU count
        if opts.NumGPU != -1 {
            break
        }

        // the "main" GPU needs the most memory and determines the limit
        // of how many layers can be loaded. It needs to fit:
        // 1. the full compute graph allocation for all devices (graph)
        // 2. the proportional kv cache for all devices (kv * % layers)
        // 3. the proportional model (size * % layers / # devices)
        // This estimates the number of layers
        maxlayers := int64(ggml.NumLayers()) + 1
        devices := int64(info.DeviceCount)
        avg := vram / devices
        layers := maxlayers * (avg - graph) / (kv + size/devices)
        if layers > maxlayers {
            layers = maxlayers
        }
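        // Worked example (illustrative only; continues the 7B-like shape above
        // with size ≈ 13 GiB fp16, kv ≈ 1 GiB, graph ≈ 0.17 GiB, and two GPUs
        // sharing 16 GiB of free VRAM, so avg = 8 GiB per device):
        //   layers = 33 * (8 - 0.17) / (1 + 13/2) ≈ 34, clamped to maxlayers = 33,
        // i.e. full offload.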

        // 1 + 2 must fit on the main gpu
        min := graph + kv*layers/maxlayers
        if layers <= 0 || min > avg {
            log.Println("not enough vram available, falling back to CPU only")
            library = "cpu"
            opts.NumGPU = 0
            break
        }

        opts.NumGPU = int(layers)
    }

    // zero values let the runner fall back to the model's own rope settings
    opts.RopeFrequencyBase = 0.0
    opts.RopeFrequencyScale = 0.0
    gpuInfo := gpu.GetGPUInfo()
    return newLlmServer(gpuInfo, model, adapters, projectors, opts)
}
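
// A minimal caller sketch (illustrative; the PredictOpts/PredictResult field
// names and api.DefaultOptions are assumptions, not defined in this file):
//
//	runner, err := llm.New(workDir, modelPath, nil, nil, api.DefaultOptions())
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer runner.Close()
//	if err := runner.Predict(ctx, llm.PredictOpts{Prompt: "why is the sky blue?"},
//		func(r llm.PredictResult) { fmt.Print(r.Content) }); err != nil {
//		log.Fatal(err)
//	}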

// Init gives any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
    return nativeInit(workdir)
}

// newLlmServer picks a dynamic llm library compatible with the detected GPU
// (or the one forced via OLLAMA_LLM_LIBRARY) and starts an external server
// for the model.
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
    dynLibs := getDynLibs(gpuInfo)

    // Check to see if the user has requested a specific library instead of auto-detecting
    demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
    if demandLib != "" {
        libPath := availableDynLibs[demandLib]
        if libPath == "" {
            log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib)
        } else {
            log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib)
            dynLibs = []string{libPath}
        }
    }
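    // Example override (illustrative; the valid names are whatever keys this
    // build puts in availableDynLibs, e.g. CPU/GPU variants like "cpu_avx2"):
    //   OLLAMA_LLM_LIBRARY=cpu_avx2 ollama serve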

    // Try each candidate library in order and return the first one that loads;
    // if they all fail, report the last error.
    err2 := fmt.Errorf("unable to locate suitable llm library")
    for _, dynLib := range dynLibs {
        srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
        if err == nil {
            return srv, nil
        }
        log.Printf("Failed to load dynamic library %s: %s", dynLib, err)
        err2 = err
    }

    return nil, err2
}