gpu_info_rocm.c 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. #ifndef __APPLE__
  2. #include "gpu_info_rocm.h"
  3. #include <string.h>
  4. #define ROCM_LOOKUP_SIZE 14
  5. void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
  6. rsmi_status_t ret;
  7. resp->err = NULL;
  8. const int buflen = 256;
  9. char buf[buflen + 1];
  10. int i;
  11. struct lookup {
  12. char *s;
  13. void **p;
  14. } l[ROCM_LOOKUP_SIZE] = {
  15. {"rsmi_init", (void *)&resp->rh.initFn},
  16. {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
  17. {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
  18. {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
  19. {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
  20. {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
  21. {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
  22. {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
  23. {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
  24. {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
  25. {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
  26. {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
  27. {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
  28. {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
  29. };
  30. resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
  31. if (!resp->rh.handle) {
  32. char *msg = LOAD_ERR();
  33. snprintf(buf, buflen,
  34. "Unable to load %s library to query for Radeon GPUs: %s\n",
  35. rocm_lib_path, msg);
  36. free(msg);
  37. resp->err = strdup(buf);
  38. return;
  39. }
  40. for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
  41. *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
  42. if (!l[i].p) {
  43. UNLOAD_LIBRARY(resp->rh.handle);
  44. resp->rh.handle = NULL;
  45. char *msg = LOAD_ERR();
  46. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  47. msg);
  48. free(msg);
  49. resp->err = strdup(buf);
  50. return;
  51. }
  52. }
  53. ret = (*resp->rh.initFn)(0);
  54. if (ret != RSMI_STATUS_SUCCESS) {
  55. UNLOAD_LIBRARY(resp->rh.handle);
  56. resp->rh.handle = NULL;
  57. snprintf(buf, buflen, "rocm vram init failure: %d", ret);
  58. resp->err = strdup(buf);
  59. }
  60. return;
  61. }
  62. void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  63. resp->err = NULL;
  64. uint64_t totalMem = 0;
  65. uint64_t usedMem = 0;
  66. rsmi_status_t ret;
  67. const int buflen = 256;
  68. char buf[buflen + 1];
  69. int i;
  70. if (h.handle == NULL) {
  71. resp->err = strdup("rocm handle not initialized");
  72. return;
  73. }
  74. ret = (*h.rsmi_num_monitor_devices)(&resp->count);
  75. if (ret != RSMI_STATUS_SUCCESS) {
  76. snprintf(buf, buflen, "unable to get device count: %d", ret);
  77. resp->err = strdup(buf);
  78. return;
  79. }
  80. LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
  81. resp->total = 0;
  82. resp->free = 0;
  83. for (i = 0; i < resp->count; i++) {
  84. if (h.verbose) {
  85. // When in verbose mode, report more information about
  86. // the card we discover, but don't fail on error
  87. ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
  88. if (ret != RSMI_STATUS_SUCCESS) {
  89. LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
  90. } else {
  91. LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
  92. }
  93. ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
  94. if (ret != RSMI_STATUS_SUCCESS) {
  95. LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
  96. } else {
  97. LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
  98. }
  99. ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
  100. if (ret != RSMI_STATUS_SUCCESS) {
  101. LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
  102. } else {
  103. LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
  104. }
  105. ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
  106. if (ret != RSMI_STATUS_SUCCESS) {
  107. LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
  108. } else {
  109. LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
  110. }
  111. ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
  112. if (ret != RSMI_STATUS_SUCCESS) {
  113. LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
  114. } else {
  115. LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
  116. }
  117. ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
  118. if (ret != RSMI_STATUS_SUCCESS) {
  119. LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
  120. } else {
  121. LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
  122. }
  123. ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
  124. if (ret != RSMI_STATUS_SUCCESS) {
  125. LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
  126. } else {
  127. LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
  128. }
  129. }
  130. // Get total memory - used memory for available memory
  131. ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
  132. if (ret != RSMI_STATUS_SUCCESS) {
  133. snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
  134. resp->err = strdup(buf);
  135. return;
  136. }
  137. ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
  138. if (ret != RSMI_STATUS_SUCCESS) {
  139. snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
  140. resp->err = strdup(buf);
  141. return;
  142. }
  143. LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
  144. LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
  145. resp->total += totalMem;
  146. resp->free += totalMem - usedMem;
  147. }
  148. }
  149. void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  150. const int buflen = 256;
  151. char buf[buflen + 1];
  152. if (h.handle == NULL) {
  153. resp->str = strdup("nvml handle not initialized");
  154. resp->status = 1;
  155. return;
  156. }
  157. rsmi_version_t ver;
  158. rsmi_status_t ret;
  159. ret = h.versionGetFn(&ver);
  160. if (ret != RSMI_STATUS_SUCCESS) {
  161. snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
  162. resp->status = 1;
  163. } else {
  164. snprintf(buf, buflen, "%d", ver.major);
  165. resp->status = 0;
  166. }
  167. resp->str = strdup(buf);
  168. }
  169. #endif // __APPLE__