dyn_ext_server.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. package llm
  2. /*
  3. #cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
  4. #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
  5. #cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
  6. #cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
  7. #cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
  8. #cgo darwin CPPFLAGS: -DGGML_USE_ACCELERATE
  9. #cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
  10. #cgo darwin LDFLAGS: -lc++ -framework Accelerate
  11. #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
  12. #cgo linux CFLAGS: -D_GNU_SOURCE
  13. #cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
  14. #cgo linux windows LDFLAGS: -lpthread
  15. #include <stdlib.h>
  16. #include "dyn_ext_server.h"
  17. */
  18. import "C"
import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"
	"unsafe"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
// dynExtServer wraps a handle to a dynamically loaded llama.cpp server
// library together with the API options it was configured with.
type dynExtServer struct {
	s       C.struct_dynamic_llama_server // native handle filled in by C.dyn_init
	options api.Options                   // options supplied at construction time
}
// Note: current implementation does not support concurrent instantiations.
// mutex serializes the server lifecycle: it is acquired in newDynExtServer
// and released either on an init failure there or in Close.
var mutex sync.Mutex
  40. func newExtServerResp(len C.size_t) C.ext_server_resp_t {
  41. var resp C.ext_server_resp_t
  42. resp.msg_len = len
  43. bytes := make([]byte, len)
  44. resp.msg = (*C.char)(C.CBytes(bytes))
  45. return resp
  46. }
  47. func freeExtServerResp(resp C.ext_server_resp_t) {
  48. if resp.msg_len == 0 {
  49. return
  50. }
  51. C.free(unsafe.Pointer(resp.msg))
  52. }
  53. func extServerResponseToErr(resp C.ext_server_resp_t) error {
  54. return fmt.Errorf(C.GoString(resp.msg))
  55. }
// Note: current implementation does not support concurrent instantiations.
// llm is the single active server instance, guarded by mutex.
var llm *dynExtServer
// newDynExtServer loads the dynamic llama.cpp server library at `library`,
// configures it for `model` with the given LoRA adapters, optional
// multimodal projectors, and options, then starts its main loop.
// Only one instance may exist at a time: the global mutex is acquired here
// and held until Close (or released on a failure path below).
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if !mutex.TryLock() {
		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
		mutex.Lock()
	}
	// Make sure the dynamic loader can resolve the library's dependencies.
	gpu.UpdatePath(filepath.Dir(library))
	libPath := C.CString(library)
	defer C.free(unsafe.Pointer(libPath))
	resp := newExtServerResp(512)
	defer freeExtServerResp(resp)
	var srv C.struct_dynamic_llama_server
	C.dyn_init(libPath, &srv, &resp)
	// Negative id signals failure; resp.msg carries the native error text.
	if resp.id < 0 {
		mutex.Unlock()
		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
	}
	llm = &dynExtServer{
		s:       srv,
		options: opts,
	}
	slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
	var sparams C.ext_server_params_t
	sparams.model = C.CString(model)
	defer C.free(unsafe.Pointer(sparams.model))
	sparams.embedding = true
	sparams.n_ctx = C.uint(opts.NumCtx)
	sparams.n_batch = C.uint(opts.NumBatch)
	sparams.n_gpu_layers = C.int(opts.NumGPU)
	sparams.main_gpu = C.int(opts.MainGPU)
	sparams.n_parallel = 1 // TODO - wire up concurrency
	// Always use the value encoded in the model
	sparams.rope_freq_base = 0.0
	sparams.rope_freq_scale = 0.0
	sparams.memory_f16 = C.bool(opts.F16KV)
	sparams.use_mlock = C.bool(opts.UseMLock)
	sparams.use_mmap = C.bool(opts.UseMMap)
	if opts.UseNUMA {
		sparams.numa = C.int(1)
	} else {
		sparams.numa = C.int(0)
	}
	// Build a singly linked C list of LoRA adapters. The deferred frees run
	// at function return, after dyn_llama_server_init has consumed the list.
	sparams.lora_adapters = nil
	for i := 0; i < len(adapters); i++ {
		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
		defer C.free(unsafe.Pointer(la))
		la.adapter = C.CString(adapters[i])
		defer C.free(unsafe.Pointer(la.adapter))
		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
		la.next = nil
		if i == 0 {
			sparams.lora_adapters = la
		} else {
			// Walk to the tail and append.
			tmp := sparams.lora_adapters
			for ; tmp.next != nil; tmp = tmp.next {
			}
			tmp.next = la
		}
	}
	if len(projectors) > 0 {
		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
		sparams.mmproj = C.CString(projectors[0])
		defer C.free(unsafe.Pointer(sparams.mmproj))
	} else {
		sparams.mmproj = nil
	}
	sparams.n_threads = C.uint(opts.NumThread)
	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
		sparams.verbose_logging = C.bool(true)
	} else {
		sparams.verbose_logging = C.bool(false)
	}
	slog.Info("Initializing llama server")
	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
	initResp := newExtServerResp(512)
	defer freeExtServerResp(initResp)
	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
	if initResp.id < 0 {
		mutex.Unlock()
		err := extServerResponseToErr(initResp)
		slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
		return nil, err
	}
	slog.Info("Starting llama main loop")
	C.dyn_llama_server_start(llm.s)
	return llm, nil
}
// Predict streams a completion for predict.Prompt through the native server,
// calling fn once per non-empty generated token and a final time with
// Done=true plus timing stats. It retries with exponential backoff when the
// server reports "slot unavailable", cancels the native task when ctx is
// done, and aborts if the model emits the same token more than 30 times in
// a row.
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
	resp := newExtServerResp(128)
	defer freeExtServerResp(resp)
	if len(predict.Images) > 0 {
		slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
	}
	// Request payload matching the llama.cpp server's completion API.
	request := map[string]any{
		"prompt":            predict.Prompt,
		"stream":            true,
		"n_predict":         predict.Options.NumPredict,
		"n_keep":            predict.Options.NumKeep,
		"temperature":       predict.Options.Temperature,
		"top_k":             predict.Options.TopK,
		"top_p":             predict.Options.TopP,
		"tfs_z":             predict.Options.TFSZ,
		"typical_p":         predict.Options.TypicalP,
		"repeat_last_n":     predict.Options.RepeatLastN,
		"repeat_penalty":    predict.Options.RepeatPenalty,
		"presence_penalty":  predict.Options.PresencePenalty,
		"frequency_penalty": predict.Options.FrequencyPenalty,
		"mirostat":          predict.Options.Mirostat,
		"mirostat_tau":      predict.Options.MirostatTau,
		"mirostat_eta":      predict.Options.MirostatEta,
		"penalize_nl":       predict.Options.PenalizeNewline,
		"seed":              predict.Options.Seed,
		"stop":              predict.Options.Stop,
		"image_data":        predict.Images,
		"cache_prompt":      true,
	}
	if predict.Format == "json" {
		// Constrain output with the JSON grammar; warn if the prompt never
		// mentions JSON, since that tends to degrade results.
		request["grammar"] = jsonGrammar
		if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
			slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
		}
	}
	// NOTE(review): a 100µs initial backoff is unusually short — confirm
	// microseconds (not milliseconds) is intended here.
	retryDelay := 100 * time.Microsecond
	for retries := 0; retries < maxRetries; retries++ {
		if retries > 0 {
			time.Sleep(retryDelay) // wait before retrying
			retryDelay *= 2        // exponential backoff
		}
		// Handling JSON marshaling with special characters unescaped.
		buffer := &bytes.Buffer{}
		enc := json.NewEncoder(buffer)
		enc.SetEscapeHTML(false)
		if err := enc.Encode(request); err != nil {
			return fmt.Errorf("failed to marshal data: %w", err)
		}
		req := C.CString(buffer.String())
		// NOTE(review): this defer is inside the retry loop, so each retry's
		// buffer is held until Predict returns; bounded by maxRetries but
		// worth restructuring if retries grow.
		defer C.free(unsafe.Pointer(req))
		C.dyn_llama_server_completion(llm.s, req, &resp)
		if resp.id < 0 {
			return extServerResponseToErr(resp)
		}
		retryNeeded := false
		// keep track of the last token generated, this is used to abort if the model starts looping
		var lastToken string
		var tokenRepeat int
	out:
		for {
			select {
			case <-ctx.Done():
				// Caller cancelled: stop the native task before returning.
				return cancelCompletion(llm, resp)
			default:
				var result C.ext_server_task_result_t
				C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
				json_resp := C.GoString(result.json_resp)
				C.dyn_llama_server_release_task_result(llm.s, &result)
				var p prediction
				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
					// Bad payload: cancel the native task and report both
					// errors if the cancel itself also failed.
					C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
					if resp.id < 0 {
						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
					} else {
						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
					}
				}
				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
					retryNeeded = true
					// task will already be canceled
					break out
				}
				// Track consecutive identical tokens to detect runaway loops.
				switch {
				case strings.TrimSpace(p.Content) == lastToken:
					tokenRepeat++
				default:
					lastToken = strings.TrimSpace(p.Content)
					tokenRepeat = 0
				}
				// 30 picked as an arbitrary max token repeat limit, modify as needed
				if tokenRepeat > 30 {
					slog.Debug("prediction aborted, token repeat limit reached")
					return cancelCompletion(llm, resp)
				}
				if p.Content != "" {
					fn(PredictResult{
						Content: p.Content,
					})
				}
				if p.Stop || bool(result.stop) {
					// Final event carries prompt/eval counts and durations.
					fn(PredictResult{
						Done:               true,
						PromptEvalCount:    p.Timings.PromptN,
						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
						EvalCount:          p.Timings.PredictedN,
						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
					})
					return nil
				}
			}
		}
		if !retryNeeded {
			return nil // success
		}
	}
	// should never reach here ideally
	return fmt.Errorf("max retries exceeded")
}
  262. func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
  263. C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
  264. if resp.id < 0 {
  265. return extServerResponseToErr(resp)
  266. } else {
  267. return nil
  268. }
  269. }
  270. func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
  271. data, err := json.Marshal(TokenizeRequest{Content: prompt})
  272. if err != nil {
  273. return nil, fmt.Errorf("marshaling encode data: %w", err)
  274. }
  275. req := C.CString(string(data))
  276. defer C.free(unsafe.Pointer(req))
  277. var json_resp *C.char
  278. resp := newExtServerResp(128)
  279. defer freeExtServerResp(resp)
  280. C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
  281. if resp.id < 0 {
  282. return nil, extServerResponseToErr(resp)
  283. }
  284. defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
  285. var encoded TokenizeResponse
  286. if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
  287. return nil, fmt.Errorf("unmarshal encode response: %w", err2)
  288. }
  289. return encoded.Tokens, err
  290. }
  291. func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
  292. if len(tokens) == 0 {
  293. return "", nil
  294. }
  295. data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
  296. if err != nil {
  297. return "", fmt.Errorf("marshaling decode data: %w", err)
  298. }
  299. req := C.CString(string(data))
  300. defer C.free(unsafe.Pointer(req))
  301. var json_resp *C.char
  302. resp := newExtServerResp(128)
  303. defer freeExtServerResp(resp)
  304. C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
  305. if resp.id < 0 {
  306. return "", extServerResponseToErr(resp)
  307. }
  308. defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
  309. var decoded DetokenizeResponse
  310. if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
  311. return "", fmt.Errorf("unmarshal encode response: %w", err2)
  312. }
  313. return decoded.Content, err
  314. }
  315. func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
  316. data, err := json.Marshal(TokenizeRequest{Content: input})
  317. if err != nil {
  318. return nil, fmt.Errorf("error marshaling embed data: %w", err)
  319. }
  320. req := C.CString(string(data))
  321. defer C.free(unsafe.Pointer(req))
  322. var json_resp *C.char
  323. resp := newExtServerResp(128)
  324. defer freeExtServerResp(resp)
  325. C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
  326. if resp.id < 0 {
  327. return nil, extServerResponseToErr(resp)
  328. }
  329. defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
  330. var embedding EmbeddingResponse
  331. if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
  332. return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
  333. }
  334. return embedding.Embedding, nil
  335. }
// Close stops the native server's main loop and releases the instance lock
// acquired in newDynExtServer, allowing a new server to be created.
func (llm *dynExtServer) Close() {
	C.dyn_llama_server_stop(llm.s)
	mutex.Unlock()
}