gpu_info_cuda.c

#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include "gpu_info_cuda.h"

#include <string.h>
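// Candidate locations for the NVIDIA Management Library (NVML). cuda_init()
// walks this NULL-terminated list and uses the first entry that loads.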
#ifndef _WIN32
const char *cuda_lib_paths[] = {
    "libnvidia-ml.so",
    "/usr/lib/wsl/lib/libnvidia-ml.so",  // TODO Maybe glob?
    "/usr/lib/wsl/lib/libnvidia-ml.so.1",
    "/usr/local/cuda/lib64/libnvidia-ml.so",
    "/usr/lib/libnvidia-ml.so",
    "/usr/lib/libnvidia-ml.so.1",
    "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so",
    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
    "/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so",
    "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so",
    "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1",
    NULL,
};
#else
const char *cuda_lib_paths[] = {
    "nvml.dll",
    "",
    NULL,
};
#endif

#define CUDA_LOOKUP_SIZE 6

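// Load the NVML shared library and resolve the CUDA_LOOKUP_SIZE entry points
// listed in the lookup table below. On any failure, resp->err is set to a
// strdup'd message and the library handle is left NULL.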
void cuda_init(cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  resp->ch.handle = NULL;  // make sure the search loop below starts clean
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  struct lookup {
    char *s;
    void **p;
  } l[CUDA_LOOKUP_SIZE] = {
      {"nvmlInit_v2", (void *)&resp->ch.initFn},
      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
  };

  // Try each candidate path until one loads.
  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
  }
  if (!resp->ch.handle) {
    // TODO improve the error message: LOAD_ERR will typically report the last
    // path that was checked, which might be confusing.
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cuda_lib_paths[0], msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }
  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) {  // TODO - fix this to use a null terminated list
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    if (!*l[i].p) {  // check the resolved symbol, not the address of the slot
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      char *msg = LOAD_ERR();
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }
  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
  }

  return;
}
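
// Sum the total and free VRAM reported by NVML across every detected GPU and
// store the results in resp. resp->err is set to a strdup'd message on error.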
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle isn't initialized");
    return;
  }

  ret = (*h.getCount)(&resp->count);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  resp->total = 0;
  resp->free = 0;
  for (i = 0; i < resp->count; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getMemInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
}
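
// Query the CUDA compute capability of every detected GPU and report the
// lowest major.minor found, since the weakest device limits which kernels
// can be used across all GPUs.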
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;
  nvmlDevice_t device;
  int major = 0;
  int minor = 0;
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle not initialized");
    return;
  }

  unsigned int devices;
  ret = (*h.getCount)(&devices);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < devices; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getComputeCapability)(device, &major, &minor);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    // Report the lowest major.minor we detect as that limits our compatibility
    if (resp->major == 0 || resp->major > major) {
      resp->major = major;
      resp->minor = minor;
    } else if (resp->major == major && resp->minor > minor) {
      resp->minor = minor;
    }
  }
}

#endif  // __APPLE__