gpu_info_nvcuda.c 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. #ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
  2. #include <string.h>
  3. #include "gpu_info_nvcuda.h"
  4. void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  5. CUresult ret;
  6. resp->err = NULL;
  7. resp->num_devices = 0;
  8. const int buflen = 256;
  9. char buf[buflen + 1];
  10. int i;
  11. struct lookup {
  12. char *s;
  13. void **p;
  14. } l[] = {
  15. {"cuInit", (void *)&resp->ch.cuInit},
  16. {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
  17. {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
  18. {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
  19. {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
  20. {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
  21. {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
  22. {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
  23. {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
  24. {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
  25. {NULL, NULL},
  26. };
  27. resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
  28. if (!resp->ch.handle) {
  29. char *msg = LOAD_ERR();
  30. LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
  31. snprintf(buf, buflen,
  32. "Unable to load %s library to query for Nvidia GPUs: %s",
  33. nvcuda_lib_path, msg);
  34. free(msg);
  35. resp->err = strdup(buf);
  36. return;
  37. }
  38. for (i = 0; l[i].s != NULL; i++) {
  39. *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
  40. if (!*l[i].p) {
  41. char *msg = LOAD_ERR();
  42. LOG(resp->ch.verbose, "dlerr: %s\n", msg);
  43. UNLOAD_LIBRARY(resp->ch.handle);
  44. resp->ch.handle = NULL;
  45. snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
  46. msg);
  47. free(msg);
  48. resp->err = strdup(buf);
  49. return;
  50. }
  51. }
  52. ret = (*resp->ch.cuInit)(0);
  53. if (ret != CUDA_SUCCESS) {
  54. LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
  55. UNLOAD_LIBRARY(resp->ch.handle);
  56. resp->ch.handle = NULL;
  57. if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
  58. resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
  59. return;
  60. }
  61. snprintf(buf, buflen, "nvcuda init failure: %d", ret);
  62. resp->err = strdup(buf);
  63. return;
  64. }
  65. int version = 0;
  66. resp->ch.driver_major = 0;
  67. resp->ch.driver_minor = 0;
  68. // Report driver version if we're in verbose mode, ignore errors
  69. ret = (*resp->ch.cuDriverGetVersion)(&version);
  70. if (ret != CUDA_SUCCESS) {
  71. LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  72. } else {
  73. resp->ch.driver_major = version / 1000;
  74. resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
  75. LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  76. }
  77. ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  78. if (ret != CUDA_SUCCESS) {
  79. LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
  80. UNLOAD_LIBRARY(resp->ch.handle);
  81. resp->ch.handle = NULL;
  82. snprintf(buf, buflen, "unable to get device count: %d", ret);
  83. resp->err = strdup(buf);
  84. return;
  85. }
  86. }
  87. const int buflen = 256;
  88. void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
  89. resp->err = NULL;
  90. nvcudaMemory_t memInfo = {0,0};
  91. CUresult ret;
  92. CUdevice device = -1;
  93. CUcontext ctx = NULL;
  94. char buf[buflen + 1];
  95. CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
  96. if (h.handle == NULL) {
  97. resp->err = strdup("nvcuda handle isn't initialized");
  98. return;
  99. }
  100. ret = (*h.cuDeviceGet)(&device, i);
  101. if (ret != CUDA_SUCCESS) {
  102. snprintf(buf, buflen, "nvcuda device failed to initialize");
  103. resp->err = strdup(buf);
  104. return;
  105. }
  106. int major = 0;
  107. int minor = 0;
  108. ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  109. if (ret != CUDA_SUCCESS) {
  110. LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
  111. } else {
  112. ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
  113. if (ret != CUDA_SUCCESS) {
  114. LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
  115. } else {
  116. resp->minor = minor;
  117. resp->major = major;
  118. }
  119. }
  120. ret = (*h.cuDeviceGetUuid)(&uuid, device);
  121. if (ret != CUDA_SUCCESS) {
  122. LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
  123. snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  124. } else {
  125. // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
  126. snprintf(&resp->gpu_id[0], GPU_ID_LEN,
  127. "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
  128. uuid.bytes[0],
  129. uuid.bytes[1],
  130. uuid.bytes[2],
  131. uuid.bytes[3],
  132. uuid.bytes[4],
  133. uuid.bytes[5],
  134. uuid.bytes[6],
  135. uuid.bytes[7],
  136. uuid.bytes[8],
  137. uuid.bytes[9],
  138. uuid.bytes[10],
  139. uuid.bytes[11],
  140. uuid.bytes[12],
  141. uuid.bytes[13],
  142. uuid.bytes[14],
  143. uuid.bytes[15]
  144. );
  145. }
  146. ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
  147. if (ret != CUDA_SUCCESS) {
  148. LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
  149. resp->gpu_name[0] = '\0';
  150. }
  151. // To get memory we have to set (and release) a context
  152. ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  153. if (ret != CUDA_SUCCESS) {
  154. snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
  155. resp->err = strdup(buf);
  156. return;
  157. }
  158. ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
  159. if (ret != CUDA_SUCCESS) {
  160. snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
  161. resp->err = strdup(buf);
  162. // Best effort on failure...
  163. (*h.cuCtxDestroy)(ctx);
  164. return;
  165. }
  166. resp->total = memInfo.total;
  167. resp->free = memInfo.free;
  168. LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
  169. LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
  170. LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
  171. ret = (*h.cuCtxDestroy)(ctx);
  172. if (ret != CUDA_SUCCESS) {
  173. LOG(1, "nvcuda failed to release primary device context %d", ret);
  174. }
  175. }
  176. void nvcuda_release(nvcuda_handle_t h) {
  177. LOG(h.verbose, "releasing nvcuda library\n");
  178. UNLOAD_LIBRARY(h.handle);
  179. // TODO and other context release logic?
  180. h.handle = NULL;
  181. }
  182. #endif // __APPLE__