gpu_info_nvcuda.c 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include <string.h>
  3. #include "gpu_info_nvcuda.h"
  4. void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  5. CUresult ret;
  6. resp->err = NULL;
  7. resp->num_devices = 0;
  8. const int buflen = 256;
  9. char buf[buflen + 1];
  10. int i;
  11. struct lookup {
  12. char *s;
  13. void **p;
  14. } l[] = {
  15. {"cuInit", (void *)&resp->ch.cuInit},
  16. {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
  17. {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
  18. {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
  19. {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
  20. {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
  21. {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
  22. {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
  23. {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
  24. {NULL, NULL},
  25. };
  26. resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
  27. if (!resp->ch.handle) {
  28. char *msg = LOAD_ERR();
  29. LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
  30. snprintf(buf, buflen,
  31. "Unable to load %s library to query for Nvidia GPUs: %s",
  32. nvcuda_lib_path, msg);
  33. free(msg);
  34. resp->err = strdup(buf);
  35. return;
  36. }
  37. for (i = 0; l[i].s != NULL; i++) {
  38. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  39. if (!*l[i].p) {
  40. char *msg = LOAD_ERR();
  41. LOG(resp->ch.verbose, "dlerr: %s\n", msg);
  42. UNLOAD_LIBRARY(resp->ch.handle);
  43. resp->ch.handle = NULL;
  44. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  45. msg);
  46. free(msg);
  47. resp->err = strdup(buf);
  48. return;
  49. }
  50. }
  51. ret = (*resp->ch.cuInit)(0);
  52. if (ret != CUDA_SUCCESS) {
  53. LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
  54. UNLOAD_LIBRARY(resp->ch.handle);
  55. resp->ch.handle = NULL;
  56. if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
  57. resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
  58. return;
  59. }
  60. snprintf(buf, buflen, "nvcuda init failure: %d", ret);
  61. resp->err = strdup(buf);
  62. return;
  63. }
  64. int version = 0;
  65. nvcudaDriverVersion_t driverVersion;
  66. driverVersion.major = 0;
  67. driverVersion.minor = 0;
  68. // Report driver version if we're in verbose mode, ignore errors
  69. ret = (*resp->ch.cuDriverGetVersion)(&version);
  70. if (ret != CUDA_SUCCESS) {
  71. LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  72. } else {
  73. driverVersion.major = version / 1000;
  74. driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
  75. LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
  76. }
  77. ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  78. if (ret != CUDA_SUCCESS) {
  79. LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
  80. UNLOAD_LIBRARY(resp->ch.handle);
  81. resp->ch.handle = NULL;
  82. snprintf(buf, buflen, "unable to get device count: %d", ret);
  83. resp->err = strdup(buf);
  84. return;
  85. }
  86. }
  87. const int buflen = 256;
  88. void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
  89. resp->err = NULL;
  90. nvcudaMemory_t memInfo = {0,0};
  91. CUresult ret;
  92. CUdevice device = -1;
  93. CUcontext ctx = NULL;
  94. char buf[buflen + 1];
  95. CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
  96. if (h.handle == NULL) {
  97. resp->err = strdup("nvcuda handle isn't initialized");
  98. return;
  99. }
  100. ret = (*h.cuDeviceGet)(&device, i);
  101. if (ret != CUDA_SUCCESS) {
  102. snprintf(buf, buflen, "nvcuda device failed to initialize");
  103. resp->err = strdup(buf);
  104. return;
  105. }
  106. resp->major = 0;
  107. resp->minor = 0;
  108. int major = 0;
  109. int minor = 0;
  110. ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  111. if (ret != CUDA_SUCCESS) {
  112. LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
  113. } else {
  114. ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  115. if (ret != CUDA_SUCCESS) {
  116. LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
  117. } else {
  118. resp->minor = minor;
  119. resp->major = major;
  120. }
  121. }
  122. ret = (*h.cuDeviceGetUuid)(&uuid, device);
  123. if (ret != CUDA_SUCCESS) {
  124. LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
  125. snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  126. } else {
  127. // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
  128. snprintf(&resp->gpu_id[0], GPU_ID_LEN,
  129. "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
  130. uuid.bytes[0],
  131. uuid.bytes[1],
  132. uuid.bytes[2],
  133. uuid.bytes[3],
  134. uuid.bytes[4],
  135. uuid.bytes[5],
  136. uuid.bytes[6],
  137. uuid.bytes[7],
  138. uuid.bytes[8],
  139. uuid.bytes[9],
  140. uuid.bytes[10],
  141. uuid.bytes[11],
  142. uuid.bytes[12],
  143. uuid.bytes[13],
  144. uuid.bytes[14],
  145. uuid.bytes[15]
  146. );
  147. }
  148. // To get memory we have to set (and release) a context
  149. ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  150. if (ret != CUDA_SUCCESS) {
  151. snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
  152. resp->err = strdup(buf);
  153. return;
  154. }
  155. ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
  156. if (ret != CUDA_SUCCESS) {
  157. snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
  158. resp->err = strdup(buf);
  159. // Best effort on failure...
  160. (*h.cuCtxDestroy)(ctx);
  161. return;
  162. }
  163. resp->total = memInfo.total;
  164. resp->free = memInfo.free;
  165. LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
  166. LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
  167. LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
  168. ret = (*h.cuCtxDestroy)(ctx);
  169. if (ret != CUDA_SUCCESS) {
  170. LOG(1, "nvcuda failed to release primary device context %d", ret);
  171. }
  172. }
  173. void nvcuda_release(nvcuda_handle_t h) {
  174. LOG(h.verbose, "releasing nvcuda library\n");
  175. UNLOAD_LIBRARY(h.handle);
  176. // TODO and other context release logic?
  177. h.handle = NULL;
  178. }
  179. #endif // __APPLE__