gpu_info_cuda.c 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include "gpu_info_cuda.h"
  3. #include <string.h>
  4. #define CUDA_LOOKUP_SIZE 6
  5. void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
  6. nvmlReturn_t ret;
  7. resp->err = NULL;
  8. const int buflen = 256;
  9. char buf[buflen + 1];
  10. int i;
  11. struct lookup {
  12. char *s;
  13. void **p;
  14. } l[CUDA_LOOKUP_SIZE] = {
  15. {"nvmlInit_v2", (void *)&resp->ch.initFn},
  16. {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
  17. {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
  18. {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
  19. {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
  20. {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
  21. };
  22. resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
  23. if (!resp->ch.handle) {
  24. char *msg = LOAD_ERR();
  25. snprintf(buf, buflen,
  26. "Unable to load %s library to query for Nvidia GPUs: %s",
  27. cuda_lib_path, msg);
  28. free(msg);
  29. resp->err = strdup(buf);
  30. return;
  31. }
  32. for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
  33. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  34. if (!l[i].p) {
  35. UNLOAD_LIBRARY(resp->ch.handle);
  36. resp->ch.handle = NULL;
  37. char *msg = LOAD_ERR();
  38. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  39. msg);
  40. free(msg);
  41. resp->err = strdup(buf);
  42. return;
  43. }
  44. }
  45. ret = (*resp->ch.initFn)();
  46. if (ret != NVML_SUCCESS) {
  47. UNLOAD_LIBRARY(resp->ch.handle);
  48. resp->ch.handle = NULL;
  49. snprintf(buf, buflen, "nvml vram init failure: %d", ret);
  50. resp->err = strdup(buf);
  51. }
  52. return;
  53. }
  54. void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  55. resp->err = NULL;
  56. nvmlDevice_t device;
  57. nvmlMemory_t memInfo = {0};
  58. nvmlReturn_t ret;
  59. const int buflen = 256;
  60. char buf[buflen + 1];
  61. int i;
  62. if (h.handle == NULL) {
  63. resp->err = strdup("nvml handle sn't initialized");
  64. return;
  65. }
  66. ret = (*h.getCount)(&resp->count);
  67. if (ret != NVML_SUCCESS) {
  68. snprintf(buf, buflen, "unable to get device count: %d", ret);
  69. resp->err = strdup(buf);
  70. return;
  71. }
  72. resp->total = 0;
  73. resp->free = 0;
  74. for (i = 0; i < resp->count; i++) {
  75. ret = (*h.getHandle)(i, &device);
  76. if (ret != NVML_SUCCESS) {
  77. snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
  78. resp->err = strdup(buf);
  79. return;
  80. }
  81. ret = (*h.getMemInfo)(device, &memInfo);
  82. if (ret != NVML_SUCCESS) {
  83. snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
  84. resp->err = strdup(buf);
  85. return;
  86. }
  87. resp->total += memInfo.total;
  88. resp->free += memInfo.free;
  89. }
  90. }
  91. void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  92. resp->err = NULL;
  93. resp->major = 0;
  94. resp->minor = 0;
  95. nvmlDevice_t device;
  96. int major = 0;
  97. int minor = 0;
  98. nvmlReturn_t ret;
  99. const int buflen = 256;
  100. char buf[buflen + 1];
  101. int i;
  102. if (h.handle == NULL) {
  103. resp->err = strdup("nvml handle not initialized");
  104. return;
  105. }
  106. unsigned int devices;
  107. ret = (*h.getCount)(&devices);
  108. if (ret != NVML_SUCCESS) {
  109. snprintf(buf, buflen, "unable to get device count: %d", ret);
  110. resp->err = strdup(buf);
  111. return;
  112. }
  113. for (i = 0; i < devices; i++) {
  114. ret = (*h.getHandle)(i, &device);
  115. if (ret != NVML_SUCCESS) {
  116. snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
  117. resp->err = strdup(buf);
  118. return;
  119. }
  120. ret = (*h.getComputeCapability)(device, &major, &minor);
  121. if (ret != NVML_SUCCESS) {
  122. snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
  123. resp->err = strdup(buf);
  124. return;
  125. }
  126. // Report the lowest major.minor we detect as that limits our compatibility
  127. if (resp->major == 0 || resp->major > major ) {
  128. resp->major = major;
  129. resp->minor = minor;
  130. } else if ( resp->major == major && resp->minor > minor ) {
  131. resp->minor = minor;
  132. }
  133. }
  134. }
  135. #endif // __APPLE__