gpu_info_cuda.c 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include "gpu_info_cuda.h"
#include <string.h>
#ifndef _WIN32
// Candidate NVML shared-library locations on Linux, probed in order until
// one loads: the bare soname first (normal dynamic-loader search path),
// then common distro / CUDA-toolkit / WSL install locations.
const char *cuda_lib_paths[] = {
"libnvidia-ml.so",
"/usr/local/cuda/lib64/libnvidia-ml.so",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
"/usr/lib/wsl/lib/libnvidia-ml.so.1", // TODO Maybe glob?
NULL,
};
#else
// On Windows NVML ships as nvml.dll; the empty-string entry lets the
// platform LOAD_LIBRARY fall back to its default search behavior.
const char *cuda_lib_paths[] = {
"nvml.dll",
"",
NULL,
};
#endif
// Entry count of the symbol lookup table built in cuda_init().
#define CUDA_LOOKUP_SIZE 6
  21. void cuda_init(cuda_init_resp_t *resp) {
  22. nvmlReturn_t ret;
  23. resp->err = NULL;
  24. const int buflen = 256;
  25. char buf[buflen + 1];
  26. int i;
  27. struct lookup {
  28. char *s;
  29. void **p;
  30. } l[CUDA_LOOKUP_SIZE] = {
  31. {"nvmlInit_v2", (void *)&resp->ch.initFn},
  32. {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
  33. {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
  34. {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
  35. {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
  36. {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
  37. };
  38. for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
  39. resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
  40. }
  41. if (!resp->ch.handle) {
  42. // TODO improve error message, as the LOAD_ERR will have typically have the
  43. // final path that was checked which might be confusing.
  44. char *msg = LOAD_ERR();
  45. snprintf(buf, buflen,
  46. "Unable to load %s library to query for Nvidia GPUs: %s",
  47. cuda_lib_paths[0], msg);
  48. free(msg);
  49. resp->err = strdup(buf);
  50. return;
  51. }
  52. for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
  53. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  54. if (!l[i].p) {
  55. UNLOAD_LIBRARY(resp->ch.handle);
  56. resp->ch.handle = NULL;
  57. char *msg = LOAD_ERR();
  58. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  59. msg);
  60. free(msg);
  61. resp->err = strdup(buf);
  62. return;
  63. }
  64. }
  65. ret = (*resp->ch.initFn)();
  66. if (ret != NVML_SUCCESS) {
  67. snprintf(buf, buflen, "nvml vram init failure: %d", ret);
  68. resp->err = strdup(buf);
  69. }
  70. return;
  71. }
  72. void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  73. resp->err = NULL;
  74. nvmlDevice_t device;
  75. nvmlMemory_t memInfo = {0};
  76. nvmlReturn_t ret;
  77. const int buflen = 256;
  78. char buf[buflen + 1];
  79. int i;
  80. if (h.handle == NULL) {
  81. resp->err = strdup("nvml handle sn't initialized");
  82. return;
  83. }
  84. unsigned int devices;
  85. ret = (*h.getCount)(&devices);
  86. if (ret != NVML_SUCCESS) {
  87. snprintf(buf, buflen, "unable to get device count: %d", ret);
  88. resp->err = strdup(buf);
  89. return;
  90. }
  91. resp->total = 0;
  92. resp->free = 0;
  93. for (i = 0; i < devices; i++) {
  94. ret = (*h.getHandle)(i, &device);
  95. if (ret != NVML_SUCCESS) {
  96. snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
  97. resp->err = strdup(buf);
  98. return;
  99. }
  100. ret = (*h.getMemInfo)(device, &memInfo);
  101. if (ret != NVML_SUCCESS) {
  102. snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
  103. resp->err = strdup(buf);
  104. return;
  105. }
  106. resp->total += memInfo.total;
  107. resp->free += memInfo.free;
  108. }
  109. }
  110. void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  111. resp->err = NULL;
  112. resp->major = 0;
  113. resp->minor = 0;
  114. nvmlDevice_t device;
  115. int major = 0;
  116. int minor = 0;
  117. nvmlReturn_t ret;
  118. const int buflen = 256;
  119. char buf[buflen + 1];
  120. int i;
  121. if (h.handle == NULL) {
  122. resp->err = strdup("nvml handle not initialized");
  123. return;
  124. }
  125. unsigned int devices;
  126. ret = (*h.getCount)(&devices);
  127. if (ret != NVML_SUCCESS) {
  128. snprintf(buf, buflen, "unable to get device count: %d", ret);
  129. resp->err = strdup(buf);
  130. return;
  131. }
  132. for (i = 0; i < devices; i++) {
  133. ret = (*h.getHandle)(i, &device);
  134. if (ret != NVML_SUCCESS) {
  135. snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
  136. resp->err = strdup(buf);
  137. return;
  138. }
  139. ret = (*h.getComputeCapability)(device, &major, &minor);
  140. if (ret != NVML_SUCCESS) {
  141. snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
  142. resp->err = strdup(buf);
  143. return;
  144. }
  145. // Report the lowest major.minor we detect as that limits our compatibility
  146. if (resp->major == 0 || resp->major > major ) {
  147. resp->major = major;
  148. resp->minor = minor;
  149. } else if ( resp->major == major && resp->minor > minor ) {
  150. resp->minor = minor;
  151. }
  152. }
  153. }
  154. #endif // __APPLE__