ext_server.go

package llm

/*
#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin,arm64 CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a
#cgo darwin,arm64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
#cgo darwin,amd64 LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/examples/server/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a
// Note: building on linux requires the CUDA libraries below to be present, even if you only have ROCm or a CPU-only setup
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudadevrt.a
#cgo linux LDFLAGS: /usr/local/cuda/lib64/libculibos.a
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
#cgo windows LDFLAGS: -lext_server_shared -lpthread
#include <stdlib.h>
#include "server.h"
*/
import "C"

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"log"
	"os"
	"runtime"
	"strings"
	"sync"
	"time"
	"unsafe"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
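
// newExtServerResp allocates a C message buffer of the given length for the
// llama.cpp server to write status or error text into; callers must release it
// with freeExtServerResp.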
func newExtServerResp(len C.size_t) C.ext_server_resp_t {
	var resp C.ext_server_resp_t
	resp.msg_len = len
	bytes := make([]byte, len)
	resp.msg = (*C.char)(C.CBytes(bytes))
	return resp
}

func freeExtServerResp(resp C.ext_server_resp_t) {
	if resp.msg_len == 0 {
		return
	}
	C.free(unsafe.Pointer(resp.msg))
}

func extServerResponseToErr(resp C.ext_server_resp_t) error {
	return fmt.Errorf("%s", C.GoString(resp.msg))
}
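
// extServer is the cgo-facing surface of the in-process llama.cpp server.
// llamaExtServer implements it by forwarding each call to the linked C
// functions declared in server.h.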
type extServer interface {
	LLM
	llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
	llama_server_start()
	llama_server_stop()
	llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
	llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
	llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
	llama_server_release_task_result(result *C.ext_server_task_result_t)
	llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
	llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
	llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
	llama_server_release_json_resp(json_resp **C.char)
}

type llamaExtServer struct {
	api.Options
}

// Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex
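
// The methods below are thin pass-throughs from the Go interface to the
// corresponding C entry points.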
func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
	C.llama_server_init(sparams, err)
}

func (llm *llamaExtServer) llama_server_start() {
	C.llama_server_start()
}

func (llm *llamaExtServer) llama_server_stop() {
	C.llama_server_stop()
}

func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
	C.llama_server_completion(json_req, resp)
}

func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
	C.llama_server_completion_next_result(task_id, resp)
}

func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
	C.llama_server_completion_cancel(task_id, err)
}

func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
	C.llama_server_release_task_result(result)
}

func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
	C.llama_server_tokenize(json_req, json_resp, err)
}

func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
	C.llama_server_detokenize(json_req, json_resp, err)
}

func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
	C.llama_server_embedding(json_req, json_resp, err)
}

func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
	C.llama_server_release_json_resp(json_resp)
}
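
// newLlamaExtServer wraps the requested options in a llamaExtServer and loads
// the model through newExtServer.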
func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	server := &llamaExtServer{opts}
	return newExtServer(server, model, adapters, projectors, numLayers, opts)
}
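
// newExtServer serializes server creation behind the package mutex, translates
// the Go options into C server parameters (GPU layers, LoRA adapters, optional
// multimodal projector, thread count), then initializes the llama.cpp server
// and starts its main loop.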
func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	if !mutex.TryLock() {
		log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
		mutex.Lock()
	}
	fileInfo, err := os.Stat(model)
	if err != nil {
		return nil, err
	}
	var sparams C.ext_server_params_t
	sparams.model = C.CString(model)
	defer C.free(unsafe.Pointer(sparams.model))

	numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)

	sparams.embedding = true
	sparams.n_ctx = C.uint(opts.NumCtx)
	sparams.n_batch = C.uint(opts.NumBatch)
	sparams.n_gpu_layers = C.int(numGPU)
	sparams.main_gpu = C.int(opts.MainGPU)
	sparams.n_parallel = 1 // TODO - wire up concurrency

	// Always use the value encoded in the model
	sparams.rope_freq_base = 0.0
	sparams.rope_freq_scale = 0.0

	sparams.memory_f16 = C.bool(opts.F16KV)
	sparams.use_mlock = C.bool(opts.UseMLock)
	sparams.use_mmap = C.bool(opts.UseMMap)
	sparams.numa = C.bool(opts.UseNUMA)

	sparams.lora_adapters = nil
	for i := 0; i < len(adapters); i++ {
		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
		defer C.free(unsafe.Pointer(la))
		la.adapter = C.CString(adapters[i])
		defer C.free(unsafe.Pointer(la.adapter))
		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
		la.next = nil
		if i == 0 {
			sparams.lora_adapters = la
		} else {
			tmp := sparams.lora_adapters
			for ; tmp.next != nil; tmp = tmp.next {
			}
			tmp.next = la
		}
	}

	if len(projectors) > 0 {
		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
		sparams.mmproj = C.CString(projectors[0])
		defer C.free(unsafe.Pointer(sparams.mmproj))
	} else {
		sparams.mmproj = nil
	}

	if opts.NumThread > 0 {
		sparams.n_threads = C.uint(opts.NumThread)
	} else {
		sparams.n_threads = C.uint(runtime.NumCPU())
	}

	log.Printf("Initializing internal llama server")
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	server.llama_server_init(&sparams, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}

	log.Printf("Starting internal llama main loop")
	server.llama_server_start()
	return server, nil
}

func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
	return predict(llm, llm.Options, ctx, pred, fn)
}
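
// predict submits a streaming completion request to the C server and polls for
// task results, forwarding each content chunk to fn. When the server reports
// "slot unavailable" the request is retried with exponential backoff, up to
// maxRetries attempts.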
func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	var imageData []ImageData
	if len(predict.Images) > 0 {
		for cnt, i := range predict.Images {
			imageData = append(imageData, ImageData{Data: i, ID: cnt})
		}
	}
	log.Printf("loaded %d images", len(imageData))
	request := map[string]any{
		"prompt":            predict.Prompt,
		"stream":            true,
		"n_predict":         opts.NumPredict,
		"n_keep":            opts.NumKeep,
		"temperature":       opts.Temperature,
		"top_k":             opts.TopK,
		"top_p":             opts.TopP,
		"tfs_z":             opts.TFSZ,
		"typical_p":         opts.TypicalP,
		"repeat_last_n":     opts.RepeatLastN,
		"repeat_penalty":    opts.RepeatPenalty,
		"presence_penalty":  opts.PresencePenalty,
		"frequency_penalty": opts.FrequencyPenalty,
		"mirostat":          opts.Mirostat,
		"mirostat_tau":      opts.MirostatTau,
		"mirostat_eta":      opts.MirostatEta,
		"penalize_nl":       opts.PenalizeNewline,
		"seed":              opts.Seed,
		"stop":              opts.Stop,
		"image_data":        imageData,
	}
	if predict.Format == "json" {
		request["grammar"] = jsonGrammar
	}

	retryDelay := 100 * time.Microsecond
	for retries := 0; retries < maxRetries; retries++ {
		if retries > 0 {
			time.Sleep(retryDelay) // wait before retrying
			retryDelay *= 2        // exponential backoff
		}

		// Handling JSON marshaling with special characters unescaped.
		buffer := &bytes.Buffer{}
		enc := json.NewEncoder(buffer)
		enc.SetEscapeHTML(false)
		if err := enc.Encode(request); err != nil {
			return fmt.Errorf("failed to marshal data: %w", err)
		}

		req := C.CString(buffer.String())
		defer C.free(unsafe.Pointer(req))

		llm.llama_server_completion(req, &resp)
		if resp.id < 0 {
			return extServerResponseToErr(resp)
		}

		retryNeeded := false
	out:
		for {
			select {
			case <-ctx.Done():
				// This handles the request cancellation
				llm.llama_server_completion_cancel(resp.id, &resp)
				if resp.id < 0 {
					return extServerResponseToErr(resp)
				} else {
					return nil
				}
			default:
				var result C.ext_server_task_result_t
				llm.llama_server_completion_next_result(resp.id, &result)
				json_resp := C.GoString(result.json_resp)
				llm.llama_server_release_task_result(&result)

				var p prediction
				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
					llm.llama_server_completion_cancel(resp.id, &resp)
					if resp.id < 0 {
						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
					} else {
						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
					}
				}

				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
					retryNeeded = true
					// task will already be canceled
					break out
				}

				if p.Content != "" {
					fn(PredictResult{
						Content: p.Content,
					})
				}

				if p.Stop {
					fn(PredictResult{
						Done:               true,
						PromptEvalCount:    p.Timings.PromptN,
						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
						EvalCount:          p.Timings.PredictedN,
						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
					})
					return nil
				}
			}
		}
		if !retryNeeded {
			return nil // success
		}
	}

	// should never reach here ideally
	return fmt.Errorf("max retries exceeded")
}

func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
	return encode(llm, ctx, prompt)
}
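
// encode asks the C server to tokenize the prompt and returns the resulting
// token ids.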
func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
	data, err := json.Marshal(TokenizeRequest{Content: prompt})
	if err != nil {
		return nil, fmt.Errorf("marshaling encode data: %w", err)
	}

	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))
	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	llm.llama_server_tokenize(req, &json_resp, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}
	defer llm.llama_server_release_json_resp(&json_resp)

	var encoded TokenizeResponse
	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
		return nil, fmt.Errorf("unmarshal encode response: %w", err2)
	}

	return encoded.Tokens, err
}

func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
	return decode(llm, ctx, tokens)
}
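
// decode asks the C server to detokenize the given token ids back into text.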
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
	if len(tokens) == 0 {
		return "", nil
	}
	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
	if err != nil {
		return "", fmt.Errorf("marshaling decode data: %w", err)
	}

	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))
	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	llm.llama_server_detokenize(req, &json_resp, &resp)
	if resp.id < 0 {
		return "", extServerResponseToErr(resp)
	}
	defer llm.llama_server_release_json_resp(&json_resp)

	var decoded DetokenizeResponse
	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
		return "", fmt.Errorf("unmarshal decode response: %w", err2)
	}

	return decoded.Content, err
}

func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
	return embedding(llm, ctx, input)
}
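
// embedding asks the C server for an embedding of the input text.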
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
	data, err := json.Marshal(TokenizeRequest{Content: input})
	if err != nil {
		return nil, fmt.Errorf("error marshaling embed data: %w", err)
	}

	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))
	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	llm.llama_server_embedding(req, &json_resp, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}
	defer llm.llama_server_release_json_resp(&json_resp)

	var embedding EmbeddingResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
	}

	return embedding.Embedding, nil
}

func (llm *llamaExtServer) Close() {
	close(llm)
}
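
// close stops the C server's main loop and releases the package mutex so the
// next server can be created.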
func close(llm extServer) {
	llm.llama_server_stop()
	mutex.Unlock()
}
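
// Typical lifecycle, as a rough sketch (the real call sites live elsewhere in
// this package; the variable names here are illustrative only):
//
//	llm, err := newLlamaExtServer(modelPath, nil, nil, numLayers, opts)
//	if err != nil {
//		return err
//	}
//	defer llm.Close()
//	err = llm.Predict(ctx, PredictOpts{Prompt: prompt}, func(r PredictResult) {
//		// stream r.Content to the caller
//	})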