gpu_info_cuda.c 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include "gpu_info_cuda.h"
  3. #include <string.h>
  // Candidate names/paths for the NVML management library, one set per
  // platform. The list is terminated by a NULL entry so it can be walked
  // without a separate length.
  const char *cuda_lib_paths[] = {
  #ifdef _WIN32
      "nvml.dll",
      "",
  #else
      "libnvidia-ml.so",
      "/usr/local/cuda/lib64/libnvidia-ml.so",
      "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
      "/usr/lib/wsl/lib/libnvidia-ml.so.1", // TODO Maybe glob?
  #endif
      NULL, // sentinel
  };
  19. #define CUDA_LOOKUP_SIZE 5
  20. void cuda_init(cuda_init_resp_t *resp) {
  21. nvmlReturn_t ret;
  22. resp->err = NULL;
  23. const int buflen = 256;
  24. char buf[buflen + 1];
  25. int i;
  26. struct lookup {
  27. char *s;
  28. void **p;
  29. } l[CUDA_LOOKUP_SIZE] = {
  30. {"nvmlInit_v2", (void *)&resp->ch.initFn},
  31. {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
  32. {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
  33. {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
  34. {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
  35. };
  36. for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
  37. resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
  38. }
  39. if (!resp->ch.handle) {
  40. // TODO improve error message, as the LOAD_ERR will have typically have the
  41. // final path that was checked which might be confusing.
  42. char *msg = LOAD_ERR();
  43. snprintf(buf, buflen,
  44. "Unable to load %s library to query for Nvidia GPUs: %s",
  45. cuda_lib_paths[0], msg);
  46. free(msg);
  47. resp->err = strdup(buf);
  48. return;
  49. }
  50. for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
  51. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  52. if (!l[i].p) {
  53. UNLOAD_LIBRARY(resp->ch.handle);
  54. resp->ch.handle = NULL;
  55. char *msg = LOAD_ERR();
  56. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  57. msg);
  58. free(msg);
  59. resp->err = strdup(buf);
  60. return;
  61. }
  62. }
  63. ret = (*resp->ch.initFn)();
  64. if (ret != NVML_SUCCESS) {
  65. snprintf(buf, buflen, "nvml vram init failure: %d", ret);
  66. resp->err = strdup(buf);
  67. }
  68. return;
  69. }
  70. void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  71. resp->err = NULL;
  72. nvmlDevice_t device;
  73. nvmlMemory_t memInfo = {0};
  74. nvmlReturn_t ret;
  75. const int buflen = 256;
  76. char buf[buflen + 1];
  77. int i;
  78. if (h.handle == NULL) {
  79. resp->err = strdup("nvml handle sn't initialized");
  80. return;
  81. }
  82. unsigned int devices;
  83. ret = (*h.getCount)(&devices);
  84. if (ret != NVML_SUCCESS) {
  85. snprintf(buf, buflen, "unable to get device count: %d", ret);
  86. resp->err = strdup(buf);
  87. return;
  88. }
  89. resp->total = 0;
  90. resp->free = 0;
  91. for (i = 0; i < devices; i++) {
  92. ret = (*h.getHandle)(i, &device);
  93. if (ret != NVML_SUCCESS) {
  94. snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
  95. resp->err = strdup(buf);
  96. return;
  97. }
  98. ret = (*h.getMemInfo)(device, &memInfo);
  99. if (ret != NVML_SUCCESS) {
  100. snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
  101. resp->err = strdup(buf);
  102. return;
  103. }
  104. resp->total += memInfo.total;
  105. resp->free += memInfo.free;
  106. }
  107. }
  108. #endif // __APPLE__