gpu_info_nvcuda.c 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
#include <inttypes.h>
#include <string.h>
#include "gpu_info_nvcuda.h"
  4. void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  5. LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);
  6. CUresult ret;
  7. resp->err = NULL;
  8. resp->num_devices = 0;
  9. resp->cudaErr = CUDA_SUCCESS;
  10. const int buflen = 256;
  11. char buf[buflen + 1];
  12. int i;
  13. struct lookup {
  14. char *s;
  15. void **p;
  16. } l[] = {
  17. {"cuInit", (void *)&resp->ch.cuInit},
  18. {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
  19. {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
  20. {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
  21. {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
  22. {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
  23. {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
  24. {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
  25. {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
  26. {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
  27. {NULL, NULL},
  28. };
  29. resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
  30. if (!resp->ch.handle) {
  31. char *msg = LOAD_ERR();
  32. LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
  33. snprintf(buf, buflen,
  34. "Unable to load %s library to query for Nvidia GPUs: %s",
  35. nvcuda_lib_path, msg);
  36. free(msg);
  37. resp->err = strdup(buf);
  38. resp->cudaErr = -1;
  39. return;
  40. }
  41. for (i = 0; l[i].s != NULL; i++) {
  42. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  43. if (!*(l[i].p)) {
  44. char *msg = LOAD_ERR();
  45. LOG(resp->ch.verbose, "dlerr: %s\n", msg);
  46. UNLOAD_LIBRARY(resp->ch.handle);
  47. resp->ch.handle = NULL;
  48. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  49. msg);
  50. free(msg);
  51. resp->err = strdup(buf);
  52. resp->cudaErr = -1;
  53. return;
  54. }
  55. LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
  56. }
  57. LOG(resp->ch.verbose, "calling cuInit\n");
  58. ret = (*resp->ch.cuInit)(0);
  59. if (ret != CUDA_SUCCESS) {
  60. LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
  61. UNLOAD_LIBRARY(resp->ch.handle);
  62. resp->ch.handle = NULL;
  63. snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
  64. resp->err = strdup(buf);
  65. resp->cudaErr = ret;
  66. return;
  67. }
  68. int version = 0;
  69. resp->ch.driver_major = 0;
  70. resp->ch.driver_minor = 0;
  71. // Report driver version if we're in verbose mode, ignore errors
  72. LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
  73. ret = (*resp->ch.cuDriverGetVersion)(&version);
  74. if (ret != CUDA_SUCCESS) {
  75. LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  76. } else {
  77. LOG(resp->ch.verbose, "raw version 0x%x\n", version);
  78. resp->ch.driver_major = version / 1000;
  79. resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
  80. LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  81. }
  82. LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
  83. ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  84. if (ret != CUDA_SUCCESS) {
  85. LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
  86. UNLOAD_LIBRARY(resp->ch.handle);
  87. resp->ch.handle = NULL;
  88. snprintf(buf, buflen, "unable to get device count: %d", ret);
  89. resp->err = strdup(buf);
  90. resp->cudaErr = ret;
  91. return;
  92. }
  93. LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
  94. }
// Size passed to snprintf when nvcuda_bootstrap formats its error messages;
// that function declares its scratch buffer as buf[buflen + 1].
// (nvcuda_init declares its own local buflen and does not use this one.)
const int buflen = 256;
  96. void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
  97. resp->err = NULL;
  98. nvcudaMemory_t memInfo = {0,0};
  99. CUresult ret;
  100. CUdevice device = -1;
  101. CUcontext ctx = NULL;
  102. char buf[buflen + 1];
  103. CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
  104. if (h.handle == NULL) {
  105. resp->err = strdup("cuda driver library handle isn't initialized");
  106. return;
  107. }
  108. ret = (*h.cuDeviceGet)(&device, i);
  109. if (ret != CUDA_SUCCESS) {
  110. snprintf(buf, buflen, "cuda driver library device failed to initialize");
  111. resp->err = strdup(buf);
  112. return;
  113. }
  114. int major = 0;
  115. int minor = 0;
  116. ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  117. if (ret != CUDA_SUCCESS) {
  118. LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
  119. } else {
  120. ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  121. if (ret != CUDA_SUCCESS) {
  122. LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
  123. } else {
  124. resp->minor = minor;
  125. resp->major = major;
  126. }
  127. }
  128. ret = (*h.cuDeviceGetUuid)(&uuid, device);
  129. if (ret != CUDA_SUCCESS) {
  130. LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
  131. snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  132. } else {
  133. // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
  134. snprintf(&resp->gpu_id[0], GPU_ID_LEN,
  135. "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
  136. uuid.bytes[0],
  137. uuid.bytes[1],
  138. uuid.bytes[2],
  139. uuid.bytes[3],
  140. uuid.bytes[4],
  141. uuid.bytes[5],
  142. uuid.bytes[6],
  143. uuid.bytes[7],
  144. uuid.bytes[8],
  145. uuid.bytes[9],
  146. uuid.bytes[10],
  147. uuid.bytes[11],
  148. uuid.bytes[12],
  149. uuid.bytes[13],
  150. uuid.bytes[14],
  151. uuid.bytes[15]
  152. );
  153. }
  154. ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
  155. if (ret != CUDA_SUCCESS) {
  156. LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
  157. resp->gpu_name[0] = '\0';
  158. }
  159. // To get memory we have to set (and release) a context
  160. ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  161. if (ret != CUDA_SUCCESS) {
  162. snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
  163. resp->err = strdup(buf);
  164. return;
  165. }
  166. ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
  167. if (ret != CUDA_SUCCESS) {
  168. snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
  169. resp->err = strdup(buf);
  170. // Best effort on failure...
  171. (*h.cuCtxDestroy)(ctx);
  172. return;
  173. }
  174. resp->total = memInfo.total;
  175. resp->free = memInfo.free;
  176. LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
  177. LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
  178. LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
  179. ret = (*h.cuCtxDestroy)(ctx);
  180. if (ret != CUDA_SUCCESS) {
  181. LOG(1, "cuda driver library failed to release device context %d", ret);
  182. }
  183. }
  184. void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
  185. CUresult ret;
  186. CUcontext ctx = NULL;
  187. CUdevice device = -1;
  188. *free = 0;
  189. *total = 0;
  190. ret = (*h.cuDeviceGet)(&device, i);
  191. if (ret != CUDA_SUCCESS) {
  192. LOG(1, "cuda driver library device failed to initialize");
  193. return;
  194. }
  195. // To get memory we have to set (and release) a context
  196. ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  197. if (ret != CUDA_SUCCESS) {
  198. LOG(1, "cuda driver library failed to get device context %d", ret);
  199. return;
  200. }
  201. ret = (*h.cuMemGetInfo_v2)(free, total);
  202. if (ret != CUDA_SUCCESS) {
  203. LOG(1, "cuda driver library device memory info lookup failure %d", ret);
  204. // Best effort on failure...
  205. (*h.cuCtxDestroy)(ctx);
  206. return;
  207. }
  208. ret = (*h.cuCtxDestroy)(ctx);
  209. if (ret != CUDA_SUCCESS) {
  210. LOG(1, "cuda driver library failed to release device context %d", ret);
  211. }
  212. }
  213. void nvcuda_release(nvcuda_handle_t h) {
  214. LOG(h.verbose, "releasing cuda driver library\n");
  215. UNLOAD_LIBRARY(h.handle);
  216. // TODO and other context release logic?
  217. h.handle = NULL;
  218. }
  219. #endif // __APPLE__