ext_server.go

package llm

/*
#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin,arm64 CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/examples/server/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudadevrt.a
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libculibos.a
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
#cgo windows LDFLAGS: -lext_server_shared -lpthread
#include <stdlib.h>
#include "examples/server/server.h"
*/
import "C"

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"os"
	"runtime"
	"sync"
	"time"
	"unsafe"

	"github.com/jmorganca/ollama/api"
)
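
// errWrap converts a C ext_server_err returned by the embedded llama.cpp
// server into a Go error, freeing the C-allocated message string. A zero
// code means success.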
func errWrap(resp C.ext_server_err) error {
	if resp.code == 0 {
		return nil
	}
	// Use errors.New so the C error message is never interpreted as a format string
	err := errors.New(C.GoString(resp.err))
	C.free(unsafe.Pointer(resp.err))
	return err
}
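
// llamaExtServer is an in-process llama.cpp server bound via cgo. It embeds
// the api.Options used to build each prediction request.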
type llamaExtServer struct {
	api.Options
}

// Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex
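
// newLlamaExtServer configures the embedded llama.cpp server for the given
// model, LoRA adapters, and options, then starts its main loop before
// returning. Only one instance may be active at a time; callers must release
// it with Close.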
func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (*llamaExtServer, error) {
	if !mutex.TryLock() {
		log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
		mutex.Lock()
	}
	server := &llamaExtServer{opts}
	fileInfo, err := os.Stat(model)
	if err != nil {
		mutex.Unlock()
		return nil, err
	}

	var sparams C.ext_server_params
	sparams.model = C.CString(model)
	defer C.free(unsafe.Pointer(sparams.model))

	numGPU := NumGPU(numLayers, fileInfo.Size(), opts)
	sparams.embedding = true
	sparams.n_ctx = C.uint(opts.NumCtx)
	sparams.n_batch = C.uint(opts.NumBatch)
	sparams.n_gpu_layers = C.int(numGPU)
	sparams.main_gpu = C.int(opts.MainGPU)
	sparams.n_parallel = 2 // TODO - wire up concurrency

	// Always use the values encoded in the model
	sparams.rope_freq_base = 0.0
	sparams.rope_freq_scale = 0.0

	sparams.lora_adapters = nil
	for i := 0; i < len(adapters); i++ {
		la := (*C.ext_server_lora_adapter)(C.malloc(C.sizeof_struct_ext_server_lora_adapter))
		defer C.free(unsafe.Pointer(la))
		la.adapter = C.CString(adapters[i])
		defer C.free(unsafe.Pointer(la.adapter))
		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
		la.next = nil
		if i == 0 {
			sparams.lora_adapters = la
		} else {
			// Append to the end of the singly linked adapter list
			tmp := sparams.lora_adapters
			for ; tmp.next != nil; tmp = tmp.next {
			}
			tmp.next = la
		}
	}

	// TODO - implement ME
	// if len(projectors) > 0 {
	//   // TODO: applying multiple projectors is not supported by the llama.cpp server yet
	//   params = append(params, "--mmproj", projectors[0])
	// }

	if opts.NumThread > 0 {
		sparams.n_threads = C.uint(opts.NumThread)
	} else {
		sparams.n_threads = C.uint(runtime.NumCPU())
	}

	sparams.memory_f16 = false
	if opts.F16KV {
		sparams.memory_f16 = true
	}
	sparams.use_mlock = false
	if opts.UseMLock {
		sparams.use_mlock = true
	}
	sparams.use_mmap = true
	if !opts.UseMMap {
		sparams.use_mmap = false
	}
	sparams.numa = false
	if opts.UseNUMA {
		sparams.numa = true
	}

	log.Printf("Initializing internal llama server")
	err = errWrap(C.llama_server_init(&sparams))
	if err != nil {
		mutex.Unlock()
		return nil, err
	}

	log.Printf("Starting internal llama main loop")
	C.llama_server_start()
	return server, nil
}
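
// Predict streams a completion for the given prompt through fn. It submits
// the request to the embedded server, then polls for results until the
// context is cancelled or the server reports that generation has stopped.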
func (llm *llamaExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
	request := map[string]any{
		"prompt":            predict.Prompt,
		"stream":            true,
		"n_predict":         llm.NumPredict,
		"n_keep":            llm.NumKeep,
		"temperature":       llm.Temperature,
		"top_k":             llm.TopK,
		"top_p":             llm.TopP,
		"tfs_z":             llm.TFSZ,
		"typical_p":         llm.TypicalP,
		"repeat_last_n":     llm.RepeatLastN,
		"repeat_penalty":    llm.RepeatPenalty,
		"presence_penalty":  llm.PresencePenalty,
		"frequency_penalty": llm.FrequencyPenalty,
		"mirostat":          llm.Mirostat,
		"mirostat_tau":      llm.MirostatTau,
		"mirostat_eta":      llm.MirostatEta,
		"penalize_nl":       llm.PenalizeNewline,
		"seed":              llm.Seed,
		"stop":              llm.Stop,
	}

	if predict.Format == "json" {
		request["grammar"] = jsonGrammar
	}

	// Marshal without escaping HTML characters so special characters in the
	// prompt reach the server unmodified.
	buffer := &bytes.Buffer{}
	enc := json.NewEncoder(buffer)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(request); err != nil {
		return fmt.Errorf("failed to marshal data: %w", err)
	}

	req := C.CString(buffer.String())
	defer C.free(unsafe.Pointer(req))

	cmpCtx := C.llama_server_completion(req)
	if cmpCtx.task_id < 0 {
		defer C.free(unsafe.Pointer(cmpCtx.err))
		return errors.New(C.GoString(cmpCtx.err))
	}

	for {
		select {
		case <-ctx.Done():
			// This handles the request cancellation
			return errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
		default:
			result := C.llama_server_completion_next_result(cmpCtx.task_id)
			if result.result_json != nil {
				defer C.free(unsafe.Pointer(result.result_json))
			}
			var p prediction
			if err := json.Unmarshal([]byte(C.GoString(result.result_json)), &p); err != nil {
				err2 := errWrap(C.llama_server_completion_cancel(cmpCtx.task_id))
				return errors.Join(fmt.Errorf("error unmarshaling llm prediction response: %w", err), err2)
			}

			if p.Content != "" {
				fn(PredictResult{
					// Model: predict.Model, // XXX remove or replace?
					CreatedAt: time.Now().UTC(),
					Content:   p.Content,
				})
			}

			if p.Stop {
				fn(PredictResult{
					// Model: predict.Model, // XXX remove or replace?
					CreatedAt:          time.Now().UTC(),
					TotalDuration:      time.Since(predict.CheckpointStart),
					Done:               true,
					PromptEvalCount:    p.Timings.PromptN,
					PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
					EvalCount:          p.Timings.PredictedN,
					EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
				})
				return nil
			}
		}
	}
}
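
// Encode tokenizes a prompt with the model's tokenizer and returns the token ids.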
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
	data, err := json.Marshal(TokenizeRequest{Content: prompt})
	if err != nil {
		return nil, fmt.Errorf("marshaling encode data: %w", err)
	}

	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))
	var resp C.ext_server_resp
	err = errWrap(C.llama_server_tokenize(req, &resp))
	if resp.json_resp != nil {
		defer C.free(unsafe.Pointer(resp.json_resp))
	}

	var encoded TokenizeResponse
	if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &encoded); err2 != nil {
		return nil, fmt.Errorf("unmarshal encode response: %w", err2)
	}

	return encoded.Tokens, err
}
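
// Decode converts token ids back into the corresponding text.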
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
	if len(tokens) == 0 {
		return "", nil
	}
	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
	if err != nil {
		return "", fmt.Errorf("marshaling decode data: %w", err)
	}

	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))
	var resp C.ext_server_resp
	err = errWrap(C.llama_server_detokenize(req, &resp))
	if resp.json_resp != nil {
		defer C.free(unsafe.Pointer(resp.json_resp))
	}

	var decoded DetokenizeResponse
	if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &decoded); err2 != nil {
		return "", fmt.Errorf("unmarshal decode response: %w", err2)
	}

	return decoded.Content, err
}
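
// Embedding computes the embedding vector for the given input text.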
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
	data, err := json.Marshal(TokenizeRequest{Content: input})
	if err != nil {
		return nil, fmt.Errorf("error marshaling embed data: %w", err)
	}

	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))
	var resp C.ext_server_resp
	err = errWrap(C.llama_server_embedding(req, &resp))
	if resp.json_resp != nil {
		defer C.free(unsafe.Pointer(resp.json_resp))
	}
	if err != nil {
		return nil, err
	}

	var embedding EmbeddingResponse
	if err := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &embedding); err != nil {
		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
	}

	return embedding.Embedding, nil
}
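
// Ping checks that the embedded server is still responsive. The current
// implementation is a no-op; see the TODO below.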
func (llm *llamaExtServer) Ping(ctx context.Context) error {
	// TODO - consider some mechanism to check if the main loop and llama.cpp are in a good state
	return nil
}
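
// Close stops the embedded server and releases the lock held since
// newLlamaExtServer, allowing another instance to be created.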
func (llm *llamaExtServer) Close() {
	C.llama_server_stop()
	mutex.Unlock()
}