ext_server_common.go

package llm
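
// The cgo preamble below builds against the bundled llama.cpp "ext server"
// headers and links the platform-specific static libraries produced under
// llama.cpp/gguf/build/: Metal/Accelerate on darwin, the CPU build plus
// CUDA library search paths on linux.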
/*
#cgo CFLAGS: -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
#cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable
#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
#cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libcommon.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libext_server.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libllama.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libggml_static.a
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread

#include <stdlib.h>
#include "ext_server.h"
*/
import "C"

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"log"
	"os"
	"strings"
	"sync"
	"time"
	"unsafe"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
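
// extServer wraps the C "ext server" API from ext_server.h behind the
// package's LLM interface. JSON requests and responses cross the cgo
// boundary as C strings; response strings must be released with
// llama_server_release_json_resp.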
type extServer interface {
	LLM
	llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
	llama_server_start()
	llama_server_stop()
	llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
	llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
	llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
	llama_server_release_task_result(result *C.ext_server_task_result_t)
	llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
	llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
	llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
	llama_server_release_json_resp(json_resp **C.char)
}

// Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex
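
// newExtServerResp allocates a C-side response buffer of the given length;
// callers are responsible for releasing it with freeExtServerResp.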
func newExtServerResp(len C.size_t) C.ext_server_resp_t {
	var resp C.ext_server_resp_t
	resp.msg_len = len
	bytes := make([]byte, len)
	resp.msg = (*C.char)(C.CBytes(bytes))
	return resp
}

// freeExtServerResp releases the C buffer allocated by newExtServerResp.
func freeExtServerResp(resp C.ext_server_resp_t) {
	if resp.msg_len == 0 {
		return
	}
	C.free(unsafe.Pointer(resp.msg))
}

// extServerResponseToErr converts a C response message into a Go error.
func extServerResponseToErr(resp C.ext_server_resp_t) error {
	return fmt.Errorf("%s", C.GoString(resp.msg))
}
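
// newExtServer configures and starts an embedded llama.cpp server for the
// given model, applying any LoRA adapters and an optional multimodal
// projector. It holds the package mutex until close is called, since only
// one server may run at a time.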
func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	if !mutex.TryLock() {
		log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
		mutex.Lock()
	}
	fileInfo, err := os.Stat(model)
	if err != nil {
		return nil, err
	}
	var sparams C.ext_server_params_t
	sparams.model = C.CString(model)
	defer C.free(unsafe.Pointer(sparams.model))

	numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)

	sparams.embedding = true
	sparams.n_ctx = C.uint(opts.NumCtx)
	sparams.n_batch = C.uint(opts.NumBatch)
	sparams.n_gpu_layers = C.int(numGPU)
	sparams.main_gpu = C.int(opts.MainGPU)
	sparams.n_parallel = 1 // TODO - wire up concurrency

	// Always use the value encoded in the model
	sparams.rope_freq_base = 0.0
	sparams.rope_freq_scale = 0.0

	sparams.memory_f16 = C.bool(opts.F16KV)
	sparams.use_mlock = C.bool(opts.UseMLock)
	sparams.use_mmap = C.bool(opts.UseMMap)
	sparams.numa = C.bool(opts.UseNUMA)

	sparams.lora_adapters = nil
	for i := 0; i < len(adapters); i++ {
		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
		defer C.free(unsafe.Pointer(la))
		la.adapter = C.CString(adapters[i])
		defer C.free(unsafe.Pointer(la.adapter))
		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
		la.next = nil
		if i == 0 {
			sparams.lora_adapters = la
		} else {
			// append to the end of the singly linked adapter list
			tmp := sparams.lora_adapters
			for ; tmp.next != nil; tmp = tmp.next {
			}
			tmp.next = la
		}
	}

	if len(projectors) > 0 {
		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
		sparams.mmproj = C.CString(projectors[0])
		defer C.free(unsafe.Pointer(sparams.mmproj))
	} else {
		sparams.mmproj = nil
	}

	sparams.n_threads = C.uint(opts.NumThread)

	log.Printf("Initializing internal llama server")
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	server.llama_server_init(&sparams, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}

	log.Printf("Starting internal llama main loop")
	server.llama_server_start()
	return server, nil
}
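
// predict streams a completion for the given prompt, invoking fn once per
// generated chunk and a final time with Done set and timing stats attached.
// "slot unavailable" errors are retried with exponential backoff, up to
// maxRetries attempts (maxRetries and jsonGrammar are defined elsewhere in
// this package).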
func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error {
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)

	var imageData []ImageData
	if len(predict.Images) > 0 {
		for cnt, i := range predict.Images {
			imageData = append(imageData, ImageData{Data: i, ID: cnt})
		}
	}
	log.Printf("loaded %d images", len(imageData))

	request := map[string]any{
		"prompt":            predict.Prompt,
		"stream":            true,
		"n_predict":         predict.Options.NumPredict,
		"n_keep":            predict.Options.NumKeep,
		"temperature":       predict.Options.Temperature,
		"top_k":             predict.Options.TopK,
		"top_p":             predict.Options.TopP,
		"tfs_z":             predict.Options.TFSZ,
		"typical_p":         predict.Options.TypicalP,
		"repeat_last_n":     predict.Options.RepeatLastN,
		"repeat_penalty":    predict.Options.RepeatPenalty,
		"presence_penalty":  predict.Options.PresencePenalty,
		"frequency_penalty": predict.Options.FrequencyPenalty,
		"mirostat":          predict.Options.Mirostat,
		"mirostat_tau":      predict.Options.MirostatTau,
		"mirostat_eta":      predict.Options.MirostatEta,
		"penalize_nl":       predict.Options.PenalizeNewline,
		"seed":              predict.Options.Seed,
		"stop":              predict.Options.Stop,
		"image_data":        imageData,
		"cache_prompt":      true,
	}

	if predict.Format == "json" {
		request["grammar"] = jsonGrammar
	}

	retryDelay := 100 * time.Microsecond
	for retries := 0; retries < maxRetries; retries++ {
		if retries > 0 {
			time.Sleep(retryDelay) // wait before retrying
			retryDelay *= 2        // exponential backoff
		}

		// Marshal the request with HTML escaping disabled so special
		// characters in the prompt are passed through unmodified.
		buffer := &bytes.Buffer{}
		enc := json.NewEncoder(buffer)
		enc.SetEscapeHTML(false)
		if err := enc.Encode(request); err != nil {
			return fmt.Errorf("failed to marshal data: %w", err)
		}

		req := C.CString(buffer.String())
		defer C.free(unsafe.Pointer(req))

		llm.llama_server_completion(req, &resp)
		if resp.id < 0 {
			return extServerResponseToErr(resp)
		}

		retryNeeded := false
	out:
		for {
			select {
			case <-ctx.Done():
				// the request was canceled
				llm.llama_server_completion_cancel(resp.id, &resp)
				if resp.id < 0 {
					return extServerResponseToErr(resp)
				} else {
					return nil
				}
			default:
				var result C.ext_server_task_result_t
				llm.llama_server_completion_next_result(resp.id, &result)
				json_resp := C.GoString(result.json_resp)
				llm.llama_server_release_task_result(&result)

				var p prediction
				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
					llm.llama_server_completion_cancel(resp.id, &resp)
					if resp.id < 0 {
						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
					} else {
						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
					}
				}

				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
					retryNeeded = true
					// task will already be canceled
					break out
				}

				if p.Content != "" {
					fn(PredictResult{
						Content: p.Content,
					})
				}

				if p.Stop {
					fn(PredictResult{
						Done:               true,
						PromptEvalCount:    p.Timings.PromptN,
						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
						EvalCount:          p.Timings.PredictedN,
						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
					})
					return nil
				}
			}
		}

		if !retryNeeded {
			return nil // success
		}
	}

	// only reached if every attempt failed with "slot unavailable"
	return fmt.Errorf("max retries exceeded")
}
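
// encode tokenizes prompt via the server's tokenize endpoint and returns
// the resulting token IDs.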
func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
	data, err := json.Marshal(TokenizeRequest{Content: prompt})
	if err != nil {
		return nil, fmt.Errorf("marshaling encode data: %w", err)
	}
	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))
	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	llm.llama_server_tokenize(req, &json_resp, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}
	defer llm.llama_server_release_json_resp(&json_resp)

	var encoded TokenizeResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err != nil {
		return nil, fmt.Errorf("unmarshal encode response: %w", err)
	}
	return encoded.Tokens, nil
}
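
// decode converts a token sequence back into text via the server's
// detokenize endpoint.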
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
	if len(tokens) == 0 {
		return "", nil
	}
	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
	if err != nil {
		return "", fmt.Errorf("marshaling decode data: %w", err)
	}
	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))
	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	llm.llama_server_detokenize(req, &json_resp, &resp)
	if resp.id < 0 {
		return "", extServerResponseToErr(resp)
	}
	defer llm.llama_server_release_json_resp(&json_resp)

	var decoded DetokenizeResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err != nil {
		return "", fmt.Errorf("unmarshal decode response: %w", err)
	}
	return decoded.Content, nil
}
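
// embedding computes an embedding vector for input via the server's
// embedding endpoint.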
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
	// the embedding endpoint accepts the same {"content": ...} payload as
	// tokenize, so TokenizeRequest is reused here
	data, err := json.Marshal(TokenizeRequest{Content: input})
	if err != nil {
		return nil, fmt.Errorf("error marshaling embed data: %w", err)
	}
	req := C.CString(string(data))
	defer C.free(unsafe.Pointer(req))
	var json_resp *C.char
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	llm.llama_server_embedding(req, &json_resp, &resp)
	if resp.id < 0 {
		return nil, extServerResponseToErr(resp)
	}
	defer llm.llama_server_release_json_resp(&json_resp)

	var embedding EmbeddingResponse
	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
	}
	return embedding.Embedding, nil
}
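
// close stops the running server and releases the package mutex so another
// server may be instantiated.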
func close(llm extServer) {
	llm.llama_server_stop()
	mutex.Unlock()
}
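
// Illustrative usage (a sketch only; "myExtServer" is a hypothetical concrete
// extServer implementation, and the model path and layer count are
// placeholders):
//
//	srv, err := newExtServer(&myExtServer{}, "/path/to/model.gguf", nil, nil, 32, api.DefaultOptions())
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer close(srv)
//
//	err = predict(context.Background(), srv, PredictOpts{Prompt: "Why is the sky blue?"}, func(r PredictResult) {
//		fmt.Print(r.Content)
//	})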