ggml-backend.c

/**
 * llama.cpp - git 465219b9143ac01db0990bbcb0a081ef72ec2008
 *
 * MIT License
 *
 * Copyright (c) 2023 Georgi Gerganov
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ggml-backend.h"
#include "ggml-alloc.h"

#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define UNUSED GGML_UNUSED

#define MAX(a, b) ((a) > (b) ? (a) : (b))

// backend buffer
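//
// A ggml_backend_buffer pairs a region of memory owned by a backend with the
// vtable (iface) used to manage tensors placed in that memory. Optional vtable
// entries (get_alloc_size, init_tensor, free_tensor) fall back to sensible
// defaults in the wrappers below.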

ggml_backend_buffer_t ggml_backend_buffer_init(
        struct ggml_backend                  * backend,
        struct ggml_backend_buffer_i           iface,
               ggml_backend_buffer_context_t   context,
               size_t                          size) {
    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));

    GGML_ASSERT(iface.get_base != NULL);

    (*buffer) = (struct ggml_backend_buffer) {
        /* .interface = */ iface,
        /* .backend   = */ backend,
        /* .context   = */ context,
        /* .size      = */ size,
    };

    return buffer;
}

void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (buffer->iface.free_buffer != NULL) {
        buffer->iface.free_buffer(buffer);
    }
    free(buffer);
}

size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
    return ggml_backend_get_alignment(buffer->backend);
}

void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_base(buffer);
}

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    return buffer->size;
}

size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.get_alloc_size) {
        return buffer->iface.get_alloc_size(buffer, tensor);
    }
    return ggml_nbytes(tensor);
}

void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.init_tensor) {
        buffer->iface.init_tensor(buffer, tensor);
    }
}

void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.free_tensor) {
        buffer->iface.free_tensor(buffer, tensor);
    }
}

// backend
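//
// Thin wrappers that dispatch through the backend vtable (iface). Note that the
// synchronous ggml_backend_tensor_set/get helpers below are built from the
// async variants followed by an explicit synchronize.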

ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
    return tensor->buffer->backend;
}

const char * ggml_backend_name(ggml_backend_t backend) {
    return backend->iface.get_name(backend);
}

void ggml_backend_free(ggml_backend_t backend) {
    backend->iface.free(backend);
}

ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
    return backend->iface.alloc_buffer(backend, size);
}

size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    return backend->iface.get_alignment(backend);
}

void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
}

void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
}

void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
}

void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
}
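
// Example (sketch only): copy host data into a tensor that lives in a backend
// buffer and read it back. The tensor `t` and array `host` are illustrative
// names, not part of this file.
//
//   float host[16] = {0};
//   ggml_backend_tensor_set(t, host, 0, sizeof(host)); // upload + synchronize
//   ggml_backend_tensor_get(t, host, 0, sizeof(host)); // download + synchronize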

void ggml_backend_synchronize(ggml_backend_t backend) {
    backend->iface.synchronize(backend);
}

ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    return backend->iface.graph_plan_create(backend, cgraph);
}

void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_free(backend, plan);
}

void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_compute(backend, plan);
}

void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    backend->iface.graph_compute(backend, cgraph);
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return backend->iface.supports_op(backend, op);
}

// backend copy

static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
    if (a->type != b->type) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (a->ne[i] != b->ne[i]) {
            return false;
        }
        if (a->nb[i] != b->nb[i]) {
            return false;
        }
    }
    return true;
}

void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
    //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
    //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

    // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));

    if (src == dst) {
        return;
    }

    // TODO: allow backends to support copy to/from same backend

    if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
        ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
    } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
        ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
    } else {
        // shouldn't be hit when copying from/to CPU
#ifndef NDEBUG
        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
#endif
        size_t nbytes = ggml_nbytes(src);
        void * data = malloc(nbytes);
        ggml_backend_tensor_get(src, data, 0, nbytes);
        ggml_backend_tensor_set(dst, data, 0, nbytes);
        free(data);
    }
}

// backend CPU
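//
// Reference implementation of the backend interface on top of plain host
// memory and ggml_graph_compute. Other backends provide their own
// implementations of the same ggml_backend_i vtable.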

struct ggml_backend_cpu_context {
    int    n_threads;
    void * work_data;
    size_t work_size;
};

static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
    return "CPU";

    UNUSED(backend);
}

static void ggml_backend_cpu_free(ggml_backend_t backend) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    free(cpu_ctx->work_data);
    free(cpu_ctx);
    free(backend);
}

static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)buffer->context;
}

static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
    UNUSED(buffer);
}

static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
    /* .free_buffer    = */ ggml_backend_cpu_buffer_free_buffer,
    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL, // no initialization required
    /* .free_tensor    = */ NULL, // no cleanup required
};

// for buffers from ptr, free is not called
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
    /* .free_buffer    = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL,
    /* .free_tensor    = */ NULL,
};

static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?

    return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
}

static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
    return TENSOR_ALIGNMENT;
    UNUSED(backend);
}

static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

    memcpy((char *)tensor->data + offset, data, size);

    UNUSED(backend);
}

static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

    memcpy(data, (const char *)tensor->data + offset, size);

    UNUSED(backend);
}

static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
    UNUSED(backend);
}

static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));

    UNUSED(backend);
}

static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));

    UNUSED(backend);
}

struct ggml_backend_plan_cpu {
    struct ggml_cplan cplan;
    struct ggml_cgraph cgraph;
};
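
// A CPU graph plan captures the ggml_cplan (work buffer and thread count)
// together with a copy of the graph, so it can be computed again later via
// ggml_backend_cpu_graph_plan_compute.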

static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

    cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    cpu_plan->cgraph = *cgraph;

    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
    }

    return cpu_plan;
}

static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

    free(cpu_plan->cplan.work_data);
    free(cpu_plan);

    UNUSED(backend);
}

static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

    UNUSED(backend);
}

static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);

    if (cpu_ctx->work_size < cplan.work_size) {
        // TODO: may be faster to free and use malloc to avoid the copy
        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
        cpu_ctx->work_size = cplan.work_size;
    }

    cplan.work_data = cpu_ctx->work_data;

    ggml_graph_compute(cgraph, &cplan);
}

static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return true;

    UNUSED(backend);
    UNUSED(op);
}

static struct ggml_backend_i cpu_backend_i = {
    /* .get_name           = */ ggml_backend_cpu_name,
    /* .free               = */ ggml_backend_cpu_free,
    /* .alloc_buffer       = */ ggml_backend_cpu_alloc_buffer,
    /* .get_alignment      = */ ggml_backend_cpu_get_alignment,
    /* .set_tensor_async   = */ ggml_backend_cpu_set_tensor_async,
    /* .get_tensor_async   = */ ggml_backend_cpu_get_tensor_async,
    /* .synchronize        = */ ggml_backend_cpu_synchronize,
    /* .cpy_tensor_from    = */ ggml_backend_cpu_cpy_tensor_from,
    /* .cpy_tensor_to      = */ ggml_backend_cpu_cpy_tensor_to,
    /* .graph_plan_create  = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free    = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute      = */ ggml_backend_cpu_graph_compute,
    /* .supports_op        = */ ggml_backend_cpu_supports_op,
};

ggml_backend_t ggml_backend_cpu_init(void) {
    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));

    ctx->n_threads = GGML_DEFAULT_N_THREADS;
    ctx->work_data = NULL;
    ctx->work_size = 0;

    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));

    *cpu_backend = (struct ggml_backend) {
        /* .interface = */ cpu_backend_i,
        /* .context   = */ ctx
    };
    return cpu_backend;
}

bool ggml_backend_is_cpu(ggml_backend_t backend) {
    return backend->iface.get_name == ggml_backend_cpu_name;
}

void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    ctx->n_threads = n_threads;
}

ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
    return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
}
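
// Usage sketch (not part of this file): how a caller typically drives the CPU
// backend defined above. Graph construction is abbreviated; `graph` and
// `buf_size` are illustrative names and error handling is omitted.
//
//   ggml_backend_t backend = ggml_backend_cpu_init();
//   ggml_backend_cpu_set_n_threads(backend, 8);
//
//   // allocate a buffer for the tensors of the graph (e.g. with ggml-alloc)
//   ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, buf_size);
//
//   // ... build `graph` and place its tensors in `buf` ...
//
//   ggml_backend_graph_compute(backend, graph);
//
//   ggml_backend_buffer_free(buf);
//   ggml_backend_free(backend);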