gpu_info_cuda.c 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include "gpu_info_cuda.h"
  3. #include <string.h>
  4. #define CUDA_LOOKUP_SIZE 12
  5. void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
  6. nvmlReturn_t ret;
  7. resp->err = NULL;
  8. const int buflen = 256;
  9. char buf[buflen + 1];
  10. int i;
  11. struct lookup {
  12. char *s;
  13. void **p;
  14. } l[CUDA_LOOKUP_SIZE] = {
  15. {"nvmlInit_v2", (void *)&resp->ch.initFn},
  16. {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
  17. {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
  18. {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
  19. {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
  20. {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
  21. {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
  22. {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
  23. {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
  24. {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
  25. {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
  26. {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
  27. };
  28. resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
  29. if (!resp->ch.handle) {
  30. char *msg = LOAD_ERR();
  31. snprintf(buf, buflen,
  32. "Unable to load %s library to query for Nvidia GPUs: %s",
  33. cuda_lib_path, msg);
  34. free(msg);
  35. resp->err = strdup(buf);
  36. return;
  37. }
  38. for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
  39. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  40. if (!l[i].p) {
  41. UNLOAD_LIBRARY(resp->ch.handle);
  42. resp->ch.handle = NULL;
  43. char *msg = LOAD_ERR();
  44. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  45. msg);
  46. free(msg);
  47. resp->err = strdup(buf);
  48. return;
  49. }
  50. }
  51. ret = (*resp->ch.initFn)();
  52. if (ret != NVML_SUCCESS) {
  53. UNLOAD_LIBRARY(resp->ch.handle);
  54. resp->ch.handle = NULL;
  55. snprintf(buf, buflen, "nvml vram init failure: %d", ret);
  56. resp->err = strdup(buf);
  57. }
  58. // Report driver version if we're in verbose mode, ignore errors
  59. ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
  60. if (ret != NVML_SUCCESS) {
  61. LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
  62. } else {
  63. LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
  64. }
  65. }
  66. void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  67. resp->err = NULL;
  68. nvmlDevice_t device;
  69. nvmlMemory_t memInfo = {0};
  70. nvmlReturn_t ret;
  71. const int buflen = 256;
  72. char buf[buflen + 1];
  73. int i;
  74. if (h.handle == NULL) {
  75. resp->err = strdup("nvml handle sn't initialized");
  76. return;
  77. }
  78. ret = (*h.getCount)(&resp->count);
  79. if (ret != NVML_SUCCESS) {
  80. snprintf(buf, buflen, "unable to get device count: %d", ret);
  81. resp->err = strdup(buf);
  82. return;
  83. }
  84. resp->total = 0;
  85. resp->free = 0;
  86. for (i = 0; i < resp->count; i++) {
  87. ret = (*h.getHandle)(i, &device);
  88. if (ret != NVML_SUCCESS) {
  89. snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
  90. resp->err = strdup(buf);
  91. return;
  92. }
  93. ret = (*h.getMemInfo)(device, &memInfo);
  94. if (ret != NVML_SUCCESS) {
  95. snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
  96. resp->err = strdup(buf);
  97. return;
  98. }
  99. if (h.verbose) {
  100. nvmlBrandType_t brand = 0;
  101. // When in verbose mode, report more information about
  102. // the card we discover, but don't fail on error
  103. ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
  104. if (ret != RSMI_STATUS_SUCCESS) {
  105. LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
  106. } else {
  107. LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
  108. }
  109. ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
  110. if (ret != RSMI_STATUS_SUCCESS) {
  111. LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
  112. } else {
  113. LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
  114. }
  115. ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
  116. if (ret != RSMI_STATUS_SUCCESS) {
  117. LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
  118. } else {
  119. LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
  120. }
  121. ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
  122. if (ret != RSMI_STATUS_SUCCESS) {
  123. LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
  124. } else {
  125. LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
  126. }
  127. ret = (*h.nvmlDeviceGetBrand)(device, &brand);
  128. if (ret != RSMI_STATUS_SUCCESS) {
  129. LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
  130. } else {
  131. LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
  132. }
  133. }
  134. LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
  135. LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
  136. resp->total += memInfo.total;
  137. resp->free += memInfo.free;
  138. }
  139. }
  140. void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  141. resp->err = NULL;
  142. resp->major = 0;
  143. resp->minor = 0;
  144. nvmlDevice_t device;
  145. int major = 0;
  146. int minor = 0;
  147. nvmlReturn_t ret;
  148. const int buflen = 256;
  149. char buf[buflen + 1];
  150. int i;
  151. if (h.handle == NULL) {
  152. resp->err = strdup("nvml handle not initialized");
  153. return;
  154. }
  155. unsigned int devices;
  156. ret = (*h.getCount)(&devices);
  157. if (ret != NVML_SUCCESS) {
  158. snprintf(buf, buflen, "unable to get device count: %d", ret);
  159. resp->err = strdup(buf);
  160. return;
  161. }
  162. for (i = 0; i < devices; i++) {
  163. ret = (*h.getHandle)(i, &device);
  164. if (ret != NVML_SUCCESS) {
  165. snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
  166. resp->err = strdup(buf);
  167. return;
  168. }
  169. ret = (*h.getComputeCapability)(device, &major, &minor);
  170. if (ret != NVML_SUCCESS) {
  171. snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
  172. resp->err = strdup(buf);
  173. return;
  174. }
  175. // Report the lowest major.minor we detect as that limits our compatibility
  176. if (resp->major == 0 || resp->major > major ) {
  177. resp->major = major;
  178. resp->minor = minor;
  179. } else if ( resp->major == major && resp->minor > minor ) {
  180. resp->minor = minor;
  181. }
  182. }
  183. }
  184. #endif // __APPLE__