gpu_info_rocm.c 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. #ifndef __APPLE__
  2. #include "gpu_info_rocm.h"
  3. #include <string.h>
  4. #define ROCM_LOOKUP_SIZE 5
  5. void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
  6. rsmi_status_t ret;
  7. resp->err = NULL;
  8. const int buflen = 256;
  9. char buf[buflen + 1];
  10. int i;
  11. struct lookup {
  12. char *s;
  13. void **p;
  14. } l[ROCM_LOOKUP_SIZE] = {
  15. {"rsmi_init", (void *)&resp->rh.initFn},
  16. {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
  17. {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
  18. {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
  19. {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
  20. // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
  21. };
  22. resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
  23. if (!resp->rh.handle) {
  24. char *msg = LOAD_ERR();
  25. snprintf(buf, buflen,
  26. "Unable to load %s library to query for Radeon GPUs: %s\n",
  27. rocm_lib_path, msg);
  28. free(msg);
  29. resp->err = strdup(buf);
  30. return;
  31. }
  32. for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
  33. *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
  34. if (!l[i].p) {
  35. UNLOAD_LIBRARY(resp->rh.handle);
  36. resp->rh.handle = NULL;
  37. char *msg = LOAD_ERR();
  38. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  39. msg);
  40. free(msg);
  41. resp->err = strdup(buf);
  42. return;
  43. }
  44. }
  45. ret = (*resp->rh.initFn)(0);
  46. if (ret != RSMI_STATUS_SUCCESS) {
  47. UNLOAD_LIBRARY(resp->rh.handle);
  48. resp->rh.handle = NULL;
  49. snprintf(buf, buflen, "rocm vram init failure: %d", ret);
  50. resp->err = strdup(buf);
  51. }
  52. return;
  53. }
  54. void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  55. resp->err = NULL;
  56. // uint32_t num_devices;
  57. // uint16_t device;
  58. uint64_t totalMem = 0;
  59. uint64_t usedMem = 0;
  60. rsmi_status_t ret;
  61. const int buflen = 256;
  62. char buf[buflen + 1];
  63. int i;
  64. if (h.handle == NULL) {
  65. resp->err = strdup("rocm handle not initialized");
  66. return;
  67. }
  68. // TODO - iterate through devices... ret =
  69. // rsmi_num_monitor_devices(&num_devices);
  70. // ret = (*h.getHandle)(0, &device);
  71. // if (ret != RSMI_STATUS_SUCCESS) {
  72. // printf("rocm vram device lookup failure: %d\n", ret);
  73. // return -1;
  74. // }
  75. // Get total memory - used memory for available memory
  76. ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
  77. if (ret != RSMI_STATUS_SUCCESS) {
  78. snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
  79. resp->err = strdup(buf);
  80. return;
  81. }
  82. ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
  83. if (ret != RSMI_STATUS_SUCCESS) {
  84. snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
  85. resp->err = strdup(buf);
  86. return;
  87. }
  88. // TODO: set this to the actual number of devices
  89. resp->count = 1;
  90. resp->total = totalMem;
  91. resp->free = totalMem - usedMem;
  92. return;
  93. }
  94. void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  95. const int buflen = 256;
  96. char buf[buflen + 1];
  97. if (h.handle == NULL) {
  98. resp->str = strdup("nvml handle not initialized");
  99. resp->status = 1;
  100. return;
  101. }
  102. rsmi_version_t ver;
  103. rsmi_status_t ret;
  104. ret = h.versionGetFn(&ver);
  105. if (ret != RSMI_STATUS_SUCCESS) {
  106. snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
  107. resp->status = 1;
  108. } else {
  109. snprintf(buf, buflen, "%d", ver.major);
  110. resp->status = 0;
  111. }
  112. resp->str = strdup(buf);
  113. }
  114. #endif // __APPLE__