gpu_info_rocm.c 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. #ifndef __APPLE__
  2. #include "gpu_info_rocm.h"
  3. #include <string.h>
  4. #ifndef _WIN32
  /*
   * Candidate locations of the ROCm SMI shared library on non-Windows hosts,
   * tried in order. The bare name is resolved by the dynamic loader's normal
   * search; the second entry is an absolute fallback. NULL terminates the list.
   */
  const char *rocm_lib_paths[] = {"librocm_smi64.so",
                                  "/opt/rocm/lib/librocm_smi64.so", NULL};
  10. #else
  /*
   * Candidate locations of the ROCm SMI library on Windows, tried in order;
   * NULL terminates the list.
   * TODO: untested on Windows.
   */
  const char *rocm_lib_paths[] = {"rocm_smi64.dll",
                                  "/opt/rocm/lib/rocm_smi64.dll", NULL};
  17. #endif
  18. void rocm_init(rocm_init_resp_t *resp) {
  19. rsmi_status_t ret;
  20. resp->err = NULL;
  21. const int buflen = 256;
  22. char buf[buflen + 1];
  23. int i;
  24. struct lookup {
  25. char *s;
  26. void **p;
  27. } l[4] = {
  28. {"rsmi_init", (void *)&resp->rh.initFn},
  29. {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
  30. {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
  31. {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
  32. // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
  33. };
  34. for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
  35. resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
  36. }
  37. if (!resp->rh.handle) {
  38. char *msg = LOAD_ERR();
  39. snprintf(buf, buflen,
  40. "Unable to load %s library to query for Radeon GPUs: %s\n",
  41. rocm_lib_paths[0], msg);
  42. free(msg);
  43. resp->err = strdup(buf);
  44. return;
  45. }
  46. for (i = 0; i < 4; i++) {
  47. *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
  48. if (!l[i].p) {
  49. UNLOAD_LIBRARY(resp->rh.handle);
  50. char *msg = LOAD_ERR();
  51. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  52. msg);
  53. free(msg);
  54. resp->err = strdup(buf);
  55. return;
  56. }
  57. }
  58. ret = (*resp->rh.initFn)(0);
  59. if (ret != RSMI_STATUS_SUCCESS) {
  60. snprintf(buf, buflen, "rocm vram init failure: %d", ret);
  61. resp->err = strdup(buf);
  62. }
  63. return;
  64. }
  65. void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  66. resp->err = NULL;
  67. // uint32_t num_devices;
  68. // uint16_t device;
  69. uint64_t totalMem = 0;
  70. uint64_t usedMem = 0;
  71. rsmi_status_t ret;
  72. const int buflen = 256;
  73. char buf[buflen + 1];
  74. int i;
  75. if (h.handle == NULL) {
  76. resp->err = strdup("nvml handle sn't initialized");
  77. return;
  78. }
  79. // TODO - iterate through devices... ret =
  80. // rsmi_num_monitor_devices(&num_devices);
  81. // ret = (*h.getHandle)(0, &device);
  82. // if (ret != RSMI_STATUS_SUCCESS) {
  83. // printf("rocm vram device lookup failure: %d\n", ret);
  84. // return -1;
  85. // }
  86. // Get total memory - used memory for available memory
  87. ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
  88. if (ret != RSMI_STATUS_SUCCESS) {
  89. snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
  90. resp->err = strdup(buf);
  91. return;
  92. }
  93. ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
  94. if (ret != RSMI_STATUS_SUCCESS) {
  95. snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
  96. resp->err = strdup(buf);
  97. return;
  98. }
  99. // TODO: set this to the actual number of devices
  100. resp->count = 1;
  101. resp->total = totalMem;
  102. resp->free = totalMem - usedMem;
  103. return;
  104. }
  105. #endif // __APPLE__