Browse Source

Report more information about GPUs in verbose mode

This adds calls to both the CUDA and ROCm management libraries to discover
additional attributes of the GPU(s) detected in the system, and wires up
runtime verbosity selection.  When users hit problems with GPUs, we can
ask them to run with `OLLAMA_DEBUG=1 ollama serve` and share the results.
Daniel Hiltgen 1 year ago
parent
commit
987c16b2f7
6 changed files with 171 additions and 30 deletions
  1. 9 0
      gpu/gpu.go
  2. 7 0
      gpu/gpu_info.h
  3. 52 2
      gpu/gpu_info_cuda.c
  4. 12 0
      gpu/gpu_info_cuda.h
  5. 81 27
      gpu/gpu_info_rocm.c
  6. 10 1
      gpu/gpu_info_rocm.h

+ 9 - 0
gpu/gpu.go

@@ -259,6 +259,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 
 
 func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 	var resp C.cuda_init_resp_t
 	var resp C.cuda_init_resp_t
+	resp.ch.verbose = getVerboseState()
 	for _, libPath := range cudaLibPaths {
 	for _, libPath := range cudaLibPaths {
 		lib := C.CString(libPath)
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		defer C.free(unsafe.Pointer(lib))
@@ -275,6 +276,7 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 
 
 func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 	var resp C.rocm_init_resp_t
 	var resp C.rocm_init_resp_t
+	resp.rh.verbose = getVerboseState()
 	for _, libPath := range rocmLibPaths {
 	for _, libPath := range rocmLibPaths {
 		lib := C.CString(libPath)
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		defer C.free(unsafe.Pointer(lib))
@@ -288,3 +290,10 @@ func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 	}
 	}
 	return nil
 	return nil
 }
 }
+
+func getVerboseState() C.uint16_t {
+	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+		return C.uint16_t(1)
+	}
+	return C.uint16_t(0)
+}

+ 7 - 0
gpu/gpu_info.h

@@ -27,6 +27,13 @@
 
 
 #endif
 #endif
 
 
+// LOG(verbose, fmt, ...): printf-style diagnostic written to stderr, emitted
+// only when `verbose` is non-zero.  Wrapped in do/while(0) so the macro
+// behaves as a single statement at the call site.
+#define LOG(verbose, ...)            \
+  do {                               \
+    if (verbose)                     \
+      fprintf(stderr, __VA_ARGS__);  \
+  } while (0)
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 extern "C" {
 extern "C" {
 #endif
 #endif

+ 52 - 2
gpu/gpu_info_cuda.c

@@ -4,7 +4,7 @@
 
 
 #include <string.h>
 #include <string.h>
 
 
-#define CUDA_LOOKUP_SIZE 6
+#define CUDA_LOOKUP_SIZE 12
 
 
 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
   nvmlReturn_t ret;
@@ -23,6 +23,12 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
       {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
       {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
       {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
       {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
+      {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
+      {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
+      {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
+      {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
+      {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
+      {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
   };
   };
 
 
   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
@@ -58,7 +64,13 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     resp->err = strdup(buf);
     resp->err = strdup(buf);
   }
   }
 
 
-  return;
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
+  } else {
+    LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
+  }
 }
 }
 
 
 void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
 void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
@@ -98,6 +110,44 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
       resp->err = strdup(buf);
       resp->err = strdup(buf);
       return;
       return;
     }
     }
+    if (h.verbose) {
+      nvmlBrandType_t brand = 0;
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error.
+      // These are NVML calls, so their status is compared against
+      // NVML_SUCCESS (not the ROCm RSMI_STATUS_SUCCESS constant).
+      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
+      if (ret != NVML_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
+      }
+    }
+
+    // Cast to unsigned long long so the %llu conversion is correct on every
+    // platform, and label the second value as free memory, which is what
+    // memInfo.free holds.
+    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, (unsigned long long)memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %llu\n", i, (unsigned long long)memInfo.free);
 
 
     resp->total += memInfo.total;
     resp->total += memInfo.total;
     resp->free += memInfo.free;
     resp->free += memInfo.free;

+ 12 - 0
gpu/gpu_info_cuda.h

@@ -15,14 +15,26 @@ typedef struct nvmlMemory_st {
   unsigned long long used;
   unsigned long long used;
 } nvmlMemory_t;
 } nvmlMemory_t;
 
 
+// Minimal local declaration of the NVML brand enum; only the "unknown"
+// value is named here.
+typedef enum nvmlBrandType_enum {
+  NVML_BRAND_UNKNOWN = 0,
+} nvmlBrandType_t;
+
 typedef struct cuda_handle {
 typedef struct cuda_handle {
   void *handle;
   void *handle;
+  uint16_t verbose;
   nvmlReturn_t (*initFn)(void);
   nvmlReturn_t (*initFn)(void);
   nvmlReturn_t (*shutdownFn)(void);
   nvmlReturn_t (*shutdownFn)(void);
   nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
   nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
   nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
   nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
   nvmlReturn_t (*getCount)(unsigned int *);
   nvmlReturn_t (*getCount)(unsigned int *);
   nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
   nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
+  nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
 } cuda_handle_t;
 } cuda_handle_t;
 
 
 typedef struct cuda_init_resp {
 typedef struct cuda_init_resp {

+ 81 - 27
gpu/gpu_info_rocm.c

@@ -4,7 +4,7 @@
 
 
 #include <string.h>
 #include <string.h>
 
 
-#define ROCM_LOOKUP_SIZE 5
+#define ROCM_LOOKUP_SIZE 14
 
 
 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   rsmi_status_t ret;
   rsmi_status_t ret;
@@ -21,7 +21,15 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
       {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
       {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
       {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
       {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
       {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
       {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
-      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
+      {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
+      {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
+      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
+      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
+      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
+      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
+      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
+      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
+      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
   };
   };
 
 
   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@@ -62,8 +70,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
 
 void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
 void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
   resp->err = NULL;
   resp->err = NULL;
-  // uint32_t num_devices;
-  // uint16_t device;
   uint64_t totalMem = 0;
   uint64_t totalMem = 0;
   uint64_t usedMem = 0;
   uint64_t usedMem = 0;
   rsmi_status_t ret;
   rsmi_status_t ret;
@@ -76,34 +82,82 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
     return;
     return;
   }
   }
 
 
-  // TODO - iterate through devices...  ret =
-  // rsmi_num_monitor_devices(&num_devices);
-
-  // ret = (*h.getHandle)(0, &device);
-  // if (ret != RSMI_STATUS_SUCCESS) {
-  //     printf("rocm vram device lookup failure: %d\n", ret);
-  //     return -1;
-  // }
-
-  // Get total memory - used memory for available memory
-  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
-  if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
+  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
   if (ret != RSMI_STATUS_SUCCESS) {
   if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
     resp->err = strdup(buf);
     return;
     return;
   }
   }
+  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
+
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    if (h.verbose) {
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error
+      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
+      }
+    }
 
 
-  // TODO: set this to the actual number of devices
-  resp->count = 1;
-  resp->total = totalMem;
-  resp->free = totalMem - usedMem;
-  return;
+    // Get total memory - used memory for available memory
+    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    // totalMem/usedMem are uint64_t; cast to unsigned long long so the
+    // %llu conversion is correct even where long is only 32 bits wide.
+    LOG(h.verbose, "[%d] ROCm totalMem %llu\n", i, (unsigned long long)totalMem);
+    LOG(h.verbose, "[%d] ROCm usedMem %llu\n", i, (unsigned long long)usedMem);
+    // Accumulate across devices; free memory is total minus used.
+    resp->total += totalMem;
+    resp->free += totalMem - usedMem;
+  }
 }
 }
 
 
 void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
 void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {

+ 10 - 1
gpu/gpu_info_rocm.h

@@ -24,12 +24,21 @@ typedef enum rsmi_memory_type {
 
 
 typedef struct rocm_handle {
 typedef struct rocm_handle {
   void *handle;
   void *handle;
+  uint16_t verbose;
   rsmi_status_t (*initFn)(uint64_t);
   rsmi_status_t (*initFn)(uint64_t);
   rsmi_status_t (*shutdownFn)(void);
   rsmi_status_t (*shutdownFn)(void);
   rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
   rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
-  // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
+  // Entry points looked up by name in the ROCm SMI library.  Parameter types
+  // follow the ROCm SMI API reference: the name/vendor-name/subsystem-name
+  // getters take a size_t length, the remaining string getters a uint32_t.
+  rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
+  rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
+  rsmi_status_t (*rsmi_dev_name_get) (uint32_t, char *, size_t);
+  rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, size_t);
+  rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);
+  rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, size_t);
+  rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);
 } rocm_handle_t;
 } rocm_handle_t;
 
 
 typedef struct rocm_init_resp {
 typedef struct rocm_init_resp {