llm.go

package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
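
// LLM is the interface implemented by a loaded inference backend: text
// generation, embeddings, and tokenizer encode/decode round trips.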
type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}
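
// New loads the GGML model at the given path, estimates how many layers can
// be offloaded to the available VRAM, and starts a matching llm server. A
// typical call looks roughly like this (a sketch; api.DefaultOptions is
// assumed to be the caller's baseline options):
//
//	runner, err := New(workDir, modelPath, nil, nil, api.DefaultOptions())
//	if err != nil {
//		return err
//	}
//	defer runner.Close()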
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	fmt.Println("size", ggml.Size)
	fmt.Println("filetype", ggml.FileType())
	fmt.Println("architecture", ggml.ModelFamily())
	fmt.Println("type", ggml.ModelType())
	fmt.Println("name", ggml.Name())
	fmt.Println("embd", ggml.NumEmbed())
	fmt.Println("head", ggml.NumHead())
	fmt.Println("head_kv", ggml.NumHeadKv())
	fmt.Println("gqa", ggml.NumGQA())

	available, _ := gpu.CheckVRAM()

	// For now assume filesize = model size
	// TODO: use actual model size
	requiredModel := ggml.Size

	// fp16 k,v matrices require n_ctx * n_layer * (n_embd / n_head) * n_head_kv * 2 bytes each, * 2 for key and value
	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())

	// this amount is the overhead + tensors in memory
	// TODO: get this from llama.cpp's graph calculations instead of
	// estimating it as 1/6 * kv_cache_size * num_gqa
	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6

	requiredTotal := requiredModel + requiredKv + requiredAlloc
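
	// For illustration only (numbers not taken from this code): a 7B-class
	// model with n_ctx=2048, n_layer=32, n_embd=4096, n_head=n_head_kv=32
	// gives requiredKv = 2*2*2048*32*4096*32/32 = 1 GiB, and with num_gqa=1
	// the scratch estimate requiredAlloc is about 1 GiB / 6 ≈ 180 MiB on top
	// of the file size.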

	log.Println("system memory bytes:", available)
	log.Println("required model bytes:", requiredModel)
	log.Println("required kv bytes:", requiredKv)
	log.Println("required alloc bytes:", requiredAlloc)
	log.Println("required total bytes:", requiredTotal)

	info := gpu.GetGPUInfo()
	library := info.Library

	if opts.NumGPU == -1 {
		// default to offloading all layers
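		// (the +1 is intended to cover the non-repeating output layer as well)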
		opts.NumGPU = int(ggml.NumLayers()) + 1
	}

	// decide how many layers to put on the GPU
	if opts.NumGPU > 0 {
		switch runtime.GOOS {
		case "darwin":
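			// on macOS the full model + kv + overhead must fit in the memory
			// reported by CheckVRAM, otherwise everything stays on the CPU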
			if requiredTotal > available {
				log.Println("not enough vram available, falling back to CPU only")
				opts.NumGPU = 0
			}
		default:
			if library == "cpu" || library == "default" {
				opts.NumGPU = 0
				break
			}

			// the alloc buffer and kv cache are allocated as a fixed amount on the main gpu
			// TODO: find the largest GPU and only reserve memory there
			avgAvailable := available / int64(info.DeviceCount)
			if requiredAlloc > avgAvailable {
				log.Printf("not enough vram available, falling back to CPU only")
				library = "cpu"
				opts.NumGPU = 0
				break
			}

			// we don't know which GPU will be used, so estimate
			// the scratch buffer space on all of them
			// TODO: allocate fewer layers to the GPU with the scratch buffer
			// and more to the others (based on their available memory)
			available -= requiredAlloc * int64(info.DeviceCount)

			// everything fits, no need to limit the number of offloaded layers
			if requiredModel+requiredKv <= available {
				break
			}

			// fill remaining vram with layers
			log.Println("splitting", available, "of available memory bytes into layers")
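			// assume the model weights and kv cache are spread evenly across
			// layers, so every offloaded layer costs about the same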
			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
			log.Println("bytes per layer:", bytesPerLayer)
			layers := available / bytesPerLayer
			log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
			if layers < int64(opts.NumGPU) {
				opts.NumGPU = int(layers)
			}
		}
	}

	opts.NumGQA = 0
	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0
	gpuInfo := gpu.GetGPUInfo()
	return newLlmServer(gpuInfo, model, adapters, projectors, opts)
}

// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
	return nativeInit(workdir)
}
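
// newLlmServer tries the candidate dynamic llm libraries for this GPU
// configuration (or the single library forced via OLLAMA_LLM_LIBRARY) and
// returns the first one that loads successfully.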
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	dynLibs := getDynLibs(gpuInfo)

	// Check to see if the user has requested a specific library instead of auto-detecting
	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
	if demandLib != "" {
		libPath := availableDynLibs[demandLib]
		if libPath == "" {
			log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib)
		} else {
			log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib)
			dynLibs = []string{libPath}
		}
	}

	err2 := fmt.Errorf("unable to locate suitable llm library")
	for _, dynLib := range dynLibs {
		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
		if err == nil {
			return srv, nil
		}
		log.Printf("Failed to load dynamic library %s %s", dynLib, err)
		err2 = err
	}

	return nil, err2
}