llm.go

package llm

import (
	"context"
	"log"
	"os"
	"runtime"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
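
// LLM is the interface implemented by model runners: it streams predictions,
// produces embeddings, and converts text to and from tokens.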
type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}
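
// AvailableShims maps a GPU library name to the path of the dynamic shim
// library built for it, when one is available on this system.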
var AvailableShims = map[string]string{}
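
// New decodes the GGML model at the given path, estimates how much of it fits
// in VRAM, and starts a matching llama.cpp-based server.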
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	vram, _ := gpu.CheckVRAM()
	size := ggml.Size

	// fp16 k,v matrices require n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 for key and value
	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
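	// e.g. a hypothetical llama-style model with n_ctx=2048, n_layer=32,
	// n_embd=4096 and n_head == n_head_kv needs 2*2*2048*32*4096 bytes = 1 GiB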

	// this amount is the overhead + tensors in memory
	// TODO: get this from llama.cpp's graph calculations instead of
	// estimating it as 1/6 * kv_cache_size * num_gqa
	graph := int64(ggml.NumGQA()) * kv / 6
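	// for a model without grouped-query attention, num_gqa is 1 and the
	// estimate reduces to kv/6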

	info := gpu.GetGPUInfo()
	library := info.Library
	switch runtime.GOOS {
	case "darwin":
		if opts.NumGPU == 0 {
			break
		}
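
		// Metal on Apple silicon uses unified memory: the entire model plus
		// kv cache and graph must fit, otherwise run on CPU only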
		if size+kv+graph > vram {
			log.Println("not enough vram available, falling back to CPU only")
			opts.NumGPU = 0
			break
		}

		opts.NumGPU = 1
	default:
		if library == "cpu" || library == "default" {
			log.Println("GPU not available, falling back to CPU")
			opts.NumGPU = 0
			break
		}

		// don't use GPU at all if no layers are loaded
		if opts.NumGPU == 0 {
			library = "cpu"
			break
		}

		// user-defined GPU count
		if opts.NumGPU != -1 {
			break
		}

		// the "main" GPU needs the most memory and determines the limit
		// of how many layers can be loaded. It needs to fit:
		// 1. the full compute graph allocation for all devices (graph)
		// 2. the proportional kv cache for all devices (kv * % layers)
		// 3. the proportional model (size * % layers / # devices)
		// This estimates the number of layers
		maxlayers := int64(ggml.NumLayers()) + 1
		devices := int64(info.DeviceCount)
		avg := vram / devices
		layers := maxlayers * (avg - graph) / (kv + size/devices)
		if layers > maxlayers {
			layers = maxlayers
		}
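
		// e.g. a hypothetical 33-layer budget (maxlayers) with size=4096 MiB,
		// kv=1024 MiB, graph=170 MiB on one 4096 MiB GPU:
		// layers = 33*(4096-170)/(1024+4096) = 25 of 33 layers offloaded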

		// 1 + 2 must fit on the main gpu
		min := graph + kv*layers/maxlayers
		if layers <= 0 || min > avg {
			log.Printf("not enough vram available, falling back to CPU only")
			library = "cpu"
			opts.NumGPU = 0
			break
		}

		opts.NumGPU = int(layers)
	}
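
	// assumption: zeroed rope parameters defer to the model's own
	// frequency base and scale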
	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0
	return newLlmServer(library, model, adapters, projectors, opts)
}

// Init gives any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
	return nativeInit(workdir)
}
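
// newLlmServer prefers a GPU-specific dynamic shim when one is available for
// the requested library, and falls back to the default CPU-based ext server
// otherwise.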
func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
	if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
		srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts)
		if err == nil {
			return srv, nil
		}
		log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err)
		// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
	}

	return newDefaultExtServer(model, adapters, projectors, opts)
}