gpu_info_nvml.c 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include <string.h>
  3. #include "gpu_info_nvml.h"
  4. void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  5. nvmlReturn_t ret;
  6. resp->err = NULL;
  7. const int buflen = 256;
  8. char buf[buflen + 1];
  9. int i;
  10. LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path);
  11. struct lookup {
  12. char *s;
  13. void **p;
  14. } l[] = {
  15. {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
  16. {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
  17. {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
  18. {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
  19. {NULL, NULL},
  20. };
  21. resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
  22. if (!resp->ch.handle) {
  23. char *msg = LOAD_ERR();
  24. LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
  25. snprintf(buf, buflen,
  26. "Unable to load %s library to query for Nvidia GPUs: %s",
  27. nvml_lib_path, msg);
  28. free(msg);
  29. resp->err = strdup(buf);
  30. return;
  31. }
  32. // TODO once we've squashed the remaining corner cases remove this log
  33. // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
  34. LOG(1, "XXX wiring functions nvml_init\n");
  35. for (i = 0; l[i].s != NULL; i++) {
  36. // TODO once we've squashed the remaining corner cases remove this log
  37. LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
  38. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  39. if (!l[i].p) {
  40. resp->ch.handle = NULL;
  41. char *msg = LOAD_ERR();
  42. LOG(resp->ch.verbose, "dlerr: %s\n", msg);
  43. UNLOAD_LIBRARY(resp->ch.handle);
  44. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  45. msg);
  46. free(msg);
  47. resp->err = strdup(buf);
  48. return;
  49. }
  50. }
  51. LOG(1, "XXX calling init_v2\n");
  52. ret = (*resp->ch.nvmlInit_v2)();
  53. if (ret != NVML_SUCCESS) {
  54. LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
  55. UNLOAD_LIBRARY(resp->ch.handle);
  56. resp->ch.handle = NULL;
  57. snprintf(buf, buflen, "nvml vram init failure: %d", ret);
  58. resp->err = strdup(buf);
  59. return;
  60. }
  61. LOG(1, "XXX nvml_init done\n");
  62. }
  63. void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
  64. nvmlDevice_t device;
  65. nvmlMemory_t memInfo = {0};
  66. nvmlReturn_t ret;
  67. LOG(1, "XXX in nvml_get_free\n");
  68. ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
  69. if (ret != NVML_SUCCESS) {
  70. LOG(1, "unable to get device handle %d: %d", device_id, ret);
  71. *free = 0;
  72. return;
  73. }
  74. ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
  75. if (ret != NVML_SUCCESS) {
  76. LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
  77. *free = 0;
  78. return;
  79. }
  80. *free = memInfo.free;
  81. *total = memInfo.total;
  82. *used = memInfo.used;
  83. }
  84. void nvml_release(nvml_handle_t h) {
  85. LOG(h.verbose, "releasing nvml library\n");
  86. nvmlReturn_t ret;
  87. ret = (*h.nvmlShutdown)();
  88. if (ret != NVML_SUCCESS) {
  89. LOG(1, "error during nvmlShutdown %d", ret);
  90. }
  91. UNLOAD_LIBRARY(h.handle);
  92. h.handle = NULL;
  93. }
  94. #endif // __APPLE__