01-cuda.diff 2.4 KB

diff --git a/llama/ggml-backend.c b/llama/ggml-backend.c
index 9e35ce98..179be840 100644
--- a/llama/ggml-backend.c
+++ b/llama/ggml-backend.c
@@ -87,7 +87,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
+
+// TODO: this needs to be freed in cuda and hipblas backends because
+// the cuda backend implementation compiled with msvc
+#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
     free(buffer);
+#endif
 }
 
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/llama/ggml-cuda.cu b/llama/ggml-cuda.cu
index 04b6e528..43b12bdf 100644
--- a/llama/ggml-cuda.cu
+++ b/llama/ggml-cuda.cu
@@ -392,6 +392,10 @@ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer)
 GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     delete ctx;
+
+    // TODO: this needs to be freed in cuda and hipblas backends because
+    // the cuda backend implementation compiled with msvc
+    free(buffer);
 }
 
 GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -3028,8 +3032,6 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params,
     GGML_UNUSED(params);
 }
 
-extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
-
 GGML_CALL int ggml_backend_cuda_reg_devices() {
     int device_count = ggml_backend_cuda_get_device_count();
     //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
diff --git a/llama/ggml-cuda.h b/llama/ggml-cuda.h
index 5eb4af40..50b91009 100644
--- a/llama/ggml-cuda.h
+++ b/llama/ggml-cuda.h
@@ -31,6 +31,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_typ
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
 
+GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();
+
 GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
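Taken together, the first two hunks move ownership of the buffer struct: ggml_backend_buffer_free() in ggml-backend.c skips the generic free(buffer) when the CUDA or HIPBLAS backend is compiled in, and ggml_backend_cuda_buffer_free_buffer() in ggml-cuda.cu frees the buffer instead. The TODO comments hint at why: when the CUDA backend is compiled with MSVC and the rest of ggml with another toolchain, each C runtime keeps its own heap, and a pointer from one runtime's malloc() must go back to that same runtime's free(). A minimal sketch of that rule, using hypothetical names that are not part of ggml:

    /* backend_owner.c: imagine this translation unit compiled with MSVC */
    #include <stdlib.h>

    void *owner_alloc(size_t n) {
        return malloc(n);   /* allocated on this runtime's heap */
    }

    void owner_free(void *p) {
        free(p);            /* same runtime, same heap: safe */
    }

    /* generic.c: imagine this translation unit compiled with MinGW.
       Calling free() here on a pointer from owner_alloc() would hand it
       to a different CRT heap, which can crash or corrupt memory on
       Windows; the safe pattern is to call back into owner_free(). */

The patch follows the same pattern: the backend that owns the buffer releases it through its free_buffer callback, and the generic path only calls free(buffer) for backends built in the same runtime.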
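The remaining hunks replace the local extern "C" forward declaration of ggml_backend_cuda_reg_devices() in ggml-cuda.cu with a declaration in ggml-cuda.h, so callers can reach the function through the public header. A hypothetical call site, assuming only what the header above declares:

    #include "ggml-cuda.h"

    int main(void) {
        /* registers the visible CUDA devices with the ggml backend
           registry; the snippet above suggests the return value is
           the device count */
        int device_count = ggml_backend_cuda_reg_devices();
        return device_count > 0 ? 0 : 1;
    }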