123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214 |
- #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
- #include "gpu_info_cuda.h"
- #include <string.h>
- void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
- nvmlReturn_t ret;
- resp->err = NULL;
- const int buflen = 256;
- char buf[buflen + 1];
- int i;
- struct lookup {
- char *s;
- void **p;
- } l[] = {
- {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
- {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
- {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
- {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
- {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
- {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
- {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
- {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
- {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
- {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
- {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
- {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
- {NULL, NULL},
- };
- resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
- if (!resp->ch.handle) {
- char *msg = LOAD_ERR();
- LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
- snprintf(buf, buflen,
- "Unable to load %s library to query for Nvidia GPUs: %s",
- cuda_lib_path, msg);
- free(msg);
- resp->err = strdup(buf);
- return;
- }
- // TODO once we've squashed the remaining corner cases remove this log
- LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
-
- for (i = 0; l[i].s != NULL; i++) {
- // TODO once we've squashed the remaining corner cases remove this log
- LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
- *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
- if (!l[i].p) {
- resp->ch.handle = NULL;
- char *msg = LOAD_ERR();
- LOG(resp->ch.verbose, "dlerr: %s\n", msg);
- UNLOAD_LIBRARY(resp->ch.handle);
- snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
- msg);
- free(msg);
- resp->err = strdup(buf);
- return;
- }
- }
- ret = (*resp->ch.nvmlInit_v2)();
- if (ret != NVML_SUCCESS) {
- LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
- UNLOAD_LIBRARY(resp->ch.handle);
- resp->ch.handle = NULL;
- snprintf(buf, buflen, "nvml vram init failure: %d", ret);
- resp->err = strdup(buf);
- return;
- }
- // Report driver version if we're in verbose mode, ignore errors
- ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
- if (ret != NVML_SUCCESS) {
- LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
- } else {
- LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
- }
- }
- void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
- resp->err = NULL;
- nvmlDevice_t device;
- nvmlMemory_t memInfo = {0};
- nvmlReturn_t ret;
- const int buflen = 256;
- char buf[buflen + 1];
- int i;
- if (h.handle == NULL) {
- resp->err = strdup("nvml handle sn't initialized");
- return;
- }
- ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
- if (ret != NVML_SUCCESS) {
- snprintf(buf, buflen, "unable to get device count: %d", ret);
- resp->err = strdup(buf);
- return;
- }
- resp->total = 0;
- resp->free = 0;
- for (i = 0; i < resp->count; i++) {
- ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
- if (ret != NVML_SUCCESS) {
- snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
- resp->err = strdup(buf);
- return;
- }
- ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
- if (ret != NVML_SUCCESS) {
- snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
- resp->err = strdup(buf);
- return;
- }
- if (h.verbose) {
- nvmlBrandType_t brand = 0;
- // When in verbose mode, report more information about
- // the card we discover, but don't fail on error
- ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
- if (ret != RSMI_STATUS_SUCCESS) {
- LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
- } else {
- LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
- }
- ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
- if (ret != RSMI_STATUS_SUCCESS) {
- LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
- } else {
- LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
- }
- ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
- if (ret != RSMI_STATUS_SUCCESS) {
- LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
- } else {
- LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
- }
- ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
- if (ret != RSMI_STATUS_SUCCESS) {
- LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
- } else {
- LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
- }
- ret = (*h.nvmlDeviceGetBrand)(device, &brand);
- if (ret != RSMI_STATUS_SUCCESS) {
- LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
- } else {
- LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
- }
- }
- LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
- LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
- resp->total += memInfo.total;
- resp->free += memInfo.free;
- }
- }
- void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
- resp->err = NULL;
- resp->major = 0;
- resp->minor = 0;
- nvmlDevice_t device;
- int major = 0;
- int minor = 0;
- nvmlReturn_t ret;
- const int buflen = 256;
- char buf[buflen + 1];
- int i;
- if (h.handle == NULL) {
- resp->err = strdup("nvml handle not initialized");
- return;
- }
- unsigned int devices;
- ret = (*h.nvmlDeviceGetCount_v2)(&devices);
- if (ret != NVML_SUCCESS) {
- snprintf(buf, buflen, "unable to get device count: %d", ret);
- resp->err = strdup(buf);
- return;
- }
- for (i = 0; i < devices; i++) {
- ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
- if (ret != NVML_SUCCESS) {
- snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
- resp->err = strdup(buf);
- return;
- }
- ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
- if (ret != NVML_SUCCESS) {
- snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
- resp->err = strdup(buf);
- return;
- }
- // Report the lowest major.minor we detect as that limits our compatibility
- if (resp->major == 0 || resp->major > major ) {
- resp->major = major;
- resp->minor = minor;
- } else if ( resp->major == major && resp->minor > minor ) {
- resp->minor = minor;
- }
- }
- }
- #endif // __APPLE__
|