// ext_server.h — extern "C" entrypoints into the embedded llama server
#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// NOTE(review): presumably the renamed main() of the underlying server,
// exposed so the library wrapper can drive it. The leading-double-underscore
// name is reserved to the implementation (C11 7.1.3) — confirm the build
// renames main to __main before touching this.
int __main(int argc, char **argv);

// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY
#ifdef __cplusplus
extern "C" {
#endif
// Error/response envelope shared by the API below.
// Ownership: the caller allocates msg and records its capacity in msg_len;
// the callee writes the message into that buffer.
typedef struct ext_server_resp {
  int id;         // < 0 on error
  size_t msg_len; // caller must allocate msg and set msg_len
  char *msg;      // caller-owned buffer for the error message
} ext_server_resp_t;
  19. // Allocated and freed by caller
// Node in a singly linked list of LoRA adapters passed via
// ext_server_params_t.lora_adapters.
// Allocated and freed by caller
typedef struct ext_server_lora_adapter {
  char *adapter; // NOTE(review): presumably a path to the adapter file — confirm
  float scale;   // scaling factor applied to this adapter
  struct ext_server_lora_adapter *next; // next node, or NULL at end of list
} ext_server_lora_adapter_t;
  25. // Allocated and freed by caller
// Server configuration consumed by llama_server_init.
// Allocated and freed by caller
typedef struct ext_server_params {
  char *model;           // NOTE(review): presumably the model file path — confirm
  uint32_t n_ctx;        // token context window, 0 = from model
  uint32_t n_batch;      // prompt processing maximum batch size
  uint32_t n_threads;    // number of threads to use for generation
  int32_t n_parallel;    // number of parallel sequences to decode
  float rope_freq_base;  // RoPE base frequency, 0 = from model
  float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
  bool memory_f16;       // use f16 instead of f32 for memory kv
  int32_t n_gpu_layers;  // number of layers to store in VRAM (-1 - use default)
  int32_t main_gpu;      // the GPU that is used for scratch and small tensors
  bool use_mlock;        // force system to keep model in RAM
  bool use_mmap;         // use mmap if possible
  bool numa;             // attempt optimizations that help on some NUMA systems
  bool embedding;        // get only sentence embedding
  ext_server_lora_adapter_t *lora_adapters; // caller-owned list, NULL if none
  char *mmproj; // NOTE(review): presumably multimodal projector path — confirm
} ext_server_params_t;
// One result delivered by llama_server_completion_next_result; release with
// llama_server_release_task_result when done (ext_server owns json_resp).
typedef struct ext_server_task_result {
  int id;          // id of the task this result belongs to
  bool stop;       // NOTE(review): presumably true on the final result — confirm
  bool error;      // NOTE(review): presumably json_resp describes an error — confirm
  char *json_resp; // null terminated, memory managed by ext_server
} ext_server_task_result_t;
  50. // Initialize the server once per process
  51. // err->id = 0 for success and err->msg[0] = NULL
  52. // err->id != 0 for failure, and err->msg contains error message
  53. void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
  54. // Run the main loop, called once per init
  55. void llama_server_start();
  56. // Stop the main loop and free up resources allocated in init and start. Init
  57. // must be called again to reuse
  58. void llama_server_stop();
  59. // json_req null terminated string, memory managed by caller
  60. // resp->id >= 0 on success (task ID)
  61. // resp->id < 0 on error, and resp->msg contains error message
  62. void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
  63. // Caller must call llama_server_release_task_result to free resp->json_resp
  64. void llama_server_completion_next_result(const int task_id,
  65. ext_server_task_result_t *result);
  66. void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
  67. void llama_server_release_task_result(ext_server_task_result_t *result);
  68. // Caller must call llama_server_releaes_json_resp to free json_resp if err.id <
  69. // 0
  70. void llama_server_tokenize(const char *json_req, char **json_resp,
  71. ext_server_resp_t *err);
  72. void llama_server_detokenize(const char *json_req, char **json_resp,
  73. ext_server_resp_t *err);
  74. void llama_server_embedding(const char *json_req, char **json_resp,
  75. ext_server_resp_t *err);
  76. void llama_server_release_json_resp(char **json_resp);
  77. #ifdef __cplusplus
  78. }
  79. #endif
  80. #endif
  81. #endif // LLAMA_SERVER_LIBRARY