gpu_info_rocm.c 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. #ifndef __APPLE__
  2. #include "gpu_info_rocm.h"
  3. #include <string.h>
  4. void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
  5. rsmi_status_t ret;
  6. resp->err = NULL;
  7. const int buflen = 256;
  8. char buf[buflen + 1];
  9. int i;
  10. struct lookup {
  11. char *s;
  12. void **p;
  13. } l[] = {
  14. {"rsmi_init", (void *)&resp->rh.rsmi_init},
  15. {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
  16. {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
  17. {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
  18. {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
  19. {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
  20. {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
  21. {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
  22. {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
  23. {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
  24. {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
  25. {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
  26. {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
  27. {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
  28. {NULL, NULL},
  29. };
  30. resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
  31. if (!resp->rh.handle) {
  32. char *msg = LOAD_ERR();
  33. snprintf(buf, buflen,
  34. "Unable to load %s library to query for Radeon GPUs: %s\n",
  35. rocm_lib_path, msg);
  36. free(msg);
  37. resp->err = strdup(buf);
  38. return;
  39. }
  40. // TODO once we've squashed the remaining corner cases remove this log
  41. LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
  42. for (i = 0; l[i].s != NULL; i++) {
  43. // TODO once we've squashed the remaining corner cases remove this log
  44. LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
  45. *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
  46. if (!l[i].p) {
  47. resp->rh.handle = NULL;
  48. char *msg = LOAD_ERR();
  49. LOG(resp->rh.verbose, "dlerr: %s\n", msg);
  50. UNLOAD_LIBRARY(resp->rh.handle);
  51. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  52. msg);
  53. free(msg);
  54. resp->err = strdup(buf);
  55. return;
  56. }
  57. }
  58. ret = (*resp->rh.rsmi_init)(0);
  59. if (ret != RSMI_STATUS_SUCCESS) {
  60. LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
  61. UNLOAD_LIBRARY(resp->rh.handle);
  62. resp->rh.handle = NULL;
  63. snprintf(buf, buflen, "rocm vram init failure: %d", ret);
  64. resp->err = strdup(buf);
  65. }
  66. return;
  67. }
  68. void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  69. resp->err = NULL;
  70. resp->igpu_index = -1;
  71. uint64_t totalMem = 0;
  72. uint64_t usedMem = 0;
  73. rsmi_status_t ret;
  74. const int buflen = 256;
  75. char buf[buflen + 1];
  76. int i;
  77. if (h.handle == NULL) {
  78. resp->err = strdup("rocm handle not initialized");
  79. return;
  80. }
  81. ret = (*h.rsmi_num_monitor_devices)(&resp->count);
  82. if (ret != RSMI_STATUS_SUCCESS) {
  83. snprintf(buf, buflen, "unable to get device count: %d", ret);
  84. resp->err = strdup(buf);
  85. return;
  86. }
  87. LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
  88. resp->total = 0;
  89. resp->free = 0;
  90. for (i = 0; i < resp->count; i++) {
  91. if (h.verbose) {
  92. // When in verbose mode, report more information about
  93. // the card we discover, but don't fail on error
  94. ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
  95. if (ret != RSMI_STATUS_SUCCESS) {
  96. LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
  97. } else {
  98. LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
  99. }
  100. ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
  101. if (ret != RSMI_STATUS_SUCCESS) {
  102. LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
  103. } else {
  104. LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
  105. }
  106. ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
  107. if (ret != RSMI_STATUS_SUCCESS) {
  108. LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
  109. } else {
  110. LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
  111. }
  112. ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
  113. if (ret != RSMI_STATUS_SUCCESS) {
  114. LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
  115. } else {
  116. LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
  117. }
  118. ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
  119. if (ret != RSMI_STATUS_SUCCESS) {
  120. LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
  121. } else {
  122. LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
  123. }
  124. ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
  125. if (ret != RSMI_STATUS_SUCCESS) {
  126. LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
  127. } else {
  128. LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
  129. }
  130. ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
  131. if (ret != RSMI_STATUS_SUCCESS) {
  132. LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
  133. } else {
  134. LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
  135. }
  136. }
  137. // Get total memory - used memory for available memory
  138. ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
  139. if (ret != RSMI_STATUS_SUCCESS) {
  140. snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
  141. resp->err = strdup(buf);
  142. return;
  143. }
  144. ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
  145. if (ret != RSMI_STATUS_SUCCESS) {
  146. snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
  147. resp->err = strdup(buf);
  148. return;
  149. }
  150. LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
  151. LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
  152. if (totalMem < 1024 * 1024 * 1024) {
  153. // Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
  154. LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
  155. resp->igpu_index = i;
  156. } else {
  157. resp->total += totalMem;
  158. resp->free += totalMem - usedMem;
  159. }
  160. }
  161. }
  162. void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  163. const int buflen = 256;
  164. char buf[buflen + 1];
  165. if (h.handle == NULL) {
  166. resp->str = strdup("rocm handle not initialized");
  167. resp->status = 1;
  168. return;
  169. }
  170. rsmi_version_t ver;
  171. rsmi_status_t ret;
  172. ret = h.rsmi_version_get(&ver);
  173. if (ret != RSMI_STATUS_SUCCESS) {
  174. snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
  175. resp->status = 1;
  176. } else {
  177. snprintf(buf, buflen, "%d", ver.major);
  178. resp->status = 0;
  179. }
  180. resp->str = strdup(buf);
  181. }
  182. #endif // __APPLE__