gpu_info_cudart.c 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include <string.h>
  3. #include "gpu_info_cudart.h"
  4. void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
  5. cudartReturn_t ret;
  6. resp->err = NULL;
  7. const int buflen = 256;
  8. char buf[buflen + 1];
  9. int i;
  10. struct lookup {
  11. char *s;
  12. void **p;
  13. } l[] = {
  14. {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
  15. {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
  16. {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
  17. {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
  18. {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
  19. {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
  20. {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
  21. {NULL, NULL},
  22. };
  23. resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
  24. if (!resp->ch.handle) {
  25. char *msg = LOAD_ERR();
  26. LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
  27. snprintf(buf, buflen,
  28. "Unable to load %s library to query for Nvidia GPUs: %s",
  29. cudart_lib_path, msg);
  30. free(msg);
  31. resp->err = strdup(buf);
  32. return;
  33. }
  34. // TODO once we've squashed the remaining corner cases remove this log
  35. LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
  36. for (i = 0; l[i].s != NULL; i++) {
  37. // TODO once we've squashed the remaining corner cases remove this log
  38. LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
  39. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  40. if (!l[i].p) {
  41. char *msg = LOAD_ERR();
  42. LOG(resp->ch.verbose, "dlerr: %s\n", msg);
  43. UNLOAD_LIBRARY(resp->ch.handle);
  44. resp->ch.handle = NULL;
  45. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  46. msg);
  47. free(msg);
  48. resp->err = strdup(buf);
  49. return;
  50. }
  51. }
  52. ret = (*resp->ch.cudaSetDevice)(0);
  53. if (ret != CUDART_SUCCESS) {
  54. LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
  55. UNLOAD_LIBRARY(resp->ch.handle);
  56. resp->ch.handle = NULL;
  57. snprintf(buf, buflen, "cudart init failure: %d", ret);
  58. resp->err = strdup(buf);
  59. return;
  60. }
  61. int version = 0;
  62. cudartDriverVersion_t driverVersion;
  63. driverVersion.major = 0;
  64. driverVersion.minor = 0;
  65. // Report driver version if we're in verbose mode, ignore errors
  66. ret = (*resp->ch.cudaDriverGetVersion)(&version);
  67. if (ret != CUDART_SUCCESS) {
  68. LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
  69. } else {
  70. driverVersion.major = version / 1000;
  71. driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
  72. LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
  73. }
  74. }
  75. void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
  76. resp->err = NULL;
  77. cudartMemory_t memInfo = {0,0,0};
  78. cudartReturn_t ret;
  79. const int buflen = 256;
  80. char buf[buflen + 1];
  81. int i;
  82. if (h.handle == NULL) {
  83. resp->err = strdup("cudart handle isn't initialized");
  84. return;
  85. }
  86. // cudaGetDeviceCount takes int type, resp-> count is uint
  87. int deviceCount;
  88. ret = (*h.cudaGetDeviceCount)(&deviceCount);
  89. if (ret != CUDART_SUCCESS) {
  90. snprintf(buf, buflen, "unable to get device count: %d", ret);
  91. resp->err = strdup(buf);
  92. return;
  93. } else {
  94. resp->count = (unsigned int)deviceCount;
  95. }
  96. resp->total = 0;
  97. resp->free = 0;
  98. for (i = 0; i < resp-> count; i++) {
  99. ret = (*h.cudaSetDevice)(i);
  100. if (ret != CUDART_SUCCESS) {
  101. snprintf(buf, buflen, "cudart device failed to initialize");
  102. resp->err = strdup(buf);
  103. return;
  104. }
  105. ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
  106. if (ret != CUDART_SUCCESS) {
  107. snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
  108. resp->err = strdup(buf);
  109. return;
  110. }
  111. LOG(h.verbose, "[%d] CUDA totalMem %lu\n", i, memInfo.total);
  112. LOG(h.verbose, "[%d] CUDA freeMem %lu\n", i, memInfo.free);
  113. resp->total += memInfo.total;
  114. resp->free += memInfo.free;
  115. }
  116. }
  117. void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
  118. resp->err = NULL;
  119. resp->major = 0;
  120. resp->minor = 0;
  121. int major = 0;
  122. int minor = 0;
  123. cudartReturn_t ret;
  124. const int buflen = 256;
  125. char buf[buflen + 1];
  126. int i;
  127. if (h.handle == NULL) {
  128. resp->err = strdup("cudart handle not initialized");
  129. return;
  130. }
  131. int devices;
  132. ret = (*h.cudaGetDeviceCount)(&devices);
  133. if (ret != CUDART_SUCCESS) {
  134. snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
  135. resp->err = strdup(buf);
  136. return;
  137. }
  138. for (i = 0; i < devices; i++) {
  139. ret = (*h.cudaSetDevice)(i);
  140. if (ret != CUDART_SUCCESS) {
  141. snprintf(buf, buflen, "cudart device failed to initialize");
  142. resp->err = strdup(buf);
  143. return;
  144. }
  145. ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
  146. if (ret != CUDART_SUCCESS) {
  147. snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
  148. resp->err = strdup(buf);
  149. return;
  150. }
  151. ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
  152. if (ret != CUDART_SUCCESS) {
  153. snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
  154. resp->err = strdup(buf);
  155. return;
  156. }
  157. // Report the lowest major.minor we detect as that limits our compatibility
  158. if (resp->major == 0 || resp->major > major ) {
  159. resp->major = major;
  160. resp->minor = minor;
  161. } else if ( resp->major == major && resp->minor > minor ) {
  162. resp->minor = minor;
  163. }
  164. }
  165. }
  166. #endif // __APPLE__