gpu_info_cuda.c 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include "gpu_info_cuda.h"
  3. #include <string.h>
  4. void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
  5. nvmlReturn_t ret;
  6. resp->err = NULL;
  7. const int buflen = 256;
  8. char buf[buflen + 1];
  9. int i;
  10. struct lookup {
  11. char *s;
  12. void **p;
  13. } l[] = {
  14. {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
  15. {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
  16. {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
  17. {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
  18. {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
  19. {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
  20. {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
  21. {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
  22. {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
  23. {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
  24. {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
  25. {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
  26. {NULL, NULL},
  27. };
  28. resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
  29. if (!resp->ch.handle) {
  30. char *msg = LOAD_ERR();
  31. LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
  32. snprintf(buf, buflen,
  33. "Unable to load %s library to query for Nvidia GPUs: %s",
  34. cuda_lib_path, msg);
  35. free(msg);
  36. resp->err = strdup(buf);
  37. return;
  38. }
  39. // TODO once we've squashed the remaining corner cases remove this log
  40. LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
  41. for (i = 0; l[i].s != NULL; i++) {
  42. // TODO once we've squashed the remaining corner cases remove this log
  43. LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
  44. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  45. if (!l[i].p) {
  46. resp->ch.handle = NULL;
  47. char *msg = LOAD_ERR();
  48. LOG(resp->ch.verbose, "dlerr: %s\n", msg);
  49. UNLOAD_LIBRARY(resp->ch.handle);
  50. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  51. msg);
  52. free(msg);
  53. resp->err = strdup(buf);
  54. return;
  55. }
  56. }
  57. ret = (*resp->ch.nvmlInit_v2)();
  58. if (ret != NVML_SUCCESS) {
  59. LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
  60. UNLOAD_LIBRARY(resp->ch.handle);
  61. resp->ch.handle = NULL;
  62. snprintf(buf, buflen, "nvml vram init failure: %d", ret);
  63. resp->err = strdup(buf);
  64. return;
  65. }
  66. // Report driver version if we're in verbose mode, ignore errors
  67. ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
  68. if (ret != NVML_SUCCESS) {
  69. LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
  70. } else {
  71. LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
  72. }
  73. }
  74. void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  75. resp->err = NULL;
  76. nvmlDevice_t device;
  77. nvmlMemory_t memInfo = {0};
  78. nvmlReturn_t ret;
  79. const int buflen = 256;
  80. char buf[buflen + 1];
  81. int i;
  82. if (h.handle == NULL) {
  83. resp->err = strdup("nvml handle sn't initialized");
  84. return;
  85. }
  86. ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
  87. if (ret != NVML_SUCCESS) {
  88. snprintf(buf, buflen, "unable to get device count: %d", ret);
  89. resp->err = strdup(buf);
  90. return;
  91. }
  92. resp->total = 0;
  93. resp->free = 0;
  94. for (i = 0; i < resp->count; i++) {
  95. ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
  96. if (ret != NVML_SUCCESS) {
  97. snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
  98. resp->err = strdup(buf);
  99. return;
  100. }
  101. ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
  102. if (ret != NVML_SUCCESS) {
  103. snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
  104. resp->err = strdup(buf);
  105. return;
  106. }
  107. if (h.verbose) {
  108. nvmlBrandType_t brand = 0;
  109. // When in verbose mode, report more information about
  110. // the card we discover, but don't fail on error
  111. ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
  112. if (ret != RSMI_STATUS_SUCCESS) {
  113. LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
  114. } else {
  115. LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
  116. }
  117. ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
  118. if (ret != RSMI_STATUS_SUCCESS) {
  119. LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
  120. } else {
  121. LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
  122. }
  123. ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
  124. if (ret != RSMI_STATUS_SUCCESS) {
  125. LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
  126. } else {
  127. LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
  128. }
  129. ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
  130. if (ret != RSMI_STATUS_SUCCESS) {
  131. LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
  132. } else {
  133. LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
  134. }
  135. ret = (*h.nvmlDeviceGetBrand)(device, &brand);
  136. if (ret != RSMI_STATUS_SUCCESS) {
  137. LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
  138. } else {
  139. LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
  140. }
  141. }
  142. LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
  143. LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
  144. resp->total += memInfo.total;
  145. resp->free += memInfo.free;
  146. }
  147. }
  148. void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  149. resp->err = NULL;
  150. resp->major = 0;
  151. resp->minor = 0;
  152. nvmlDevice_t device;
  153. int major = 0;
  154. int minor = 0;
  155. nvmlReturn_t ret;
  156. const int buflen = 256;
  157. char buf[buflen + 1];
  158. int i;
  159. if (h.handle == NULL) {
  160. resp->err = strdup("nvml handle not initialized");
  161. return;
  162. }
  163. unsigned int devices;
  164. ret = (*h.nvmlDeviceGetCount_v2)(&devices);
  165. if (ret != NVML_SUCCESS) {
  166. snprintf(buf, buflen, "unable to get device count: %d", ret);
  167. resp->err = strdup(buf);
  168. return;
  169. }
  170. for (i = 0; i < devices; i++) {
  171. ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
  172. if (ret != NVML_SUCCESS) {
  173. snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
  174. resp->err = strdup(buf);
  175. return;
  176. }
  177. ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
  178. if (ret != NVML_SUCCESS) {
  179. snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
  180. resp->err = strdup(buf);
  181. return;
  182. }
  183. // Report the lowest major.minor we detect as that limits our compatibility
  184. if (resp->major == 0 || resp->major > major ) {
  185. resp->major = major;
  186. resp->minor = minor;
  187. } else if ( resp->major == major && resp->minor > minor ) {
  188. resp->minor = minor;
  189. }
  190. }
  191. }
  192. #endif // __APPLE__