1 year ago · 987c16b2f7
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -259,6 +259,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 
															 func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
														
 
															 	var resp C.cuda_init_resp_t
														
 
															+	resp.ch.verbose = getVerboseState()
														
 
															 	for _, libPath := range cudaLibPaths {
														
 
															 		lib := C.CString(libPath)
														
 
															 		defer C.free(unsafe.Pointer(lib))
														
@@ -275,6 +276,7 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 
															 func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
														
 
															 	var resp C.rocm_init_resp_t
														
 
															+	resp.rh.verbose = getVerboseState()
														
 
															 	for _, libPath := range rocmLibPaths {
														
 
															 		lib := C.CString(libPath)
														
 
															 		defer C.free(unsafe.Pointer(lib))
														
@@ -288,3 +290,10 @@ func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 
															 	}
														
 
															 	return nil
														
 
															 }
														
 
															+
														
 
															+// getVerboseState reports whether verbose GPU discovery logging is enabled.
															+// It returns 1 when the OLLAMA_DEBUG environment variable is set to any
															+// non-empty value, and 0 otherwise.
															+func getVerboseState() C.uint16_t {
															+	if os.Getenv("OLLAMA_DEBUG") == "" {
															+		return C.uint16_t(0)
															+	}
															+	return C.uint16_t(1)
															+}
														
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -27,6 +27,13 @@
 
															 #endif
														
 
															+// LOG writes a printf-style message to stderr, but only when `verbose`
															+// is non-zero. The do/while(0) wrapper makes the expansion behave as a
															+// single statement, so `LOG(...);` is safe inside un-braced if/else.
															+#define LOG(verbose, ...)           \
															+  do {                              \
															+    if ((verbose))                  \
															+      fprintf(stderr, __VA_ARGS__); \
															+  } while (0)
														
 
															+
														
 
															 #ifdef __cplusplus
														
 
															 extern "C" {
														
 
															 #endif
														
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -4,7 +4,7 @@
 
															 #include <string.h>
														
 
															-#define CUDA_LOOKUP_SIZE 6
														
 
															+#define CUDA_LOOKUP_SIZE 12
														
 
															 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
														
 
															   nvmlReturn_t ret;
														
@@ -23,6 +23,12 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 
															       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
														
 
															       {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
														
 
															       {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
														
 
															+      {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
														
 
															+      {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
														
 
															+      {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
														
 
															+      {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
														
 
															+      {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
														
 
															+      {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
														
 
															   };
														
 
															   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
														
@@ -58,7 +64,13 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 
															     resp->err = strdup(buf);
														
 
															   }
														
 
															-  return;
														
 
															+  // Report driver version if we're in verbose mode, ignore errors
														
 
															+  ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
														
 
															+  if (ret != NVML_SUCCESS) {
														
 
															+    LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
														
 
															+  } else {
														
 
															+    LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
														
 
															+  }
														
 
															 }
														
 
															 void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
														
@@ -98,6 +110,44 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
 
															       resp->err = strdup(buf);
														
 
															       return;
														
 
															     }
														
 
															+    if (h.verbose) {
															+      nvmlBrandType_t brand = 0;
															+      // When in verbose mode, report more information about
															+      // the card we discover, but don't fail on error.
															+      // These are NVML calls, so results are compared against
															+      // NVML_SUCCESS (not the ROCm RSMI_STATUS_SUCCESS constant).
															+      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
															+      if (ret != NVML_SUCCESS) {
															+        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
															+      } else {
															+        LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
															+      }
															+      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
															+      if (ret != NVML_SUCCESS) {
															+        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
															+      } else {
															+        LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
															+      }
															+      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
															+      if (ret != NVML_SUCCESS) {
															+        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
															+      } else {
															+        LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
															+      }
															+      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
															+      if (ret != NVML_SUCCESS) {
															+        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
															+      } else {
															+        LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
															+      }
															+      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
															+      if (ret != NVML_SUCCESS) {
															+        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
															+      } else {
															+        LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
															+      }
															+    }
														
 
															+
														
 
															+    // Log per-device memory. The values are cast to unsigned long long
															+    // and printed with %llu so the output is correct on platforms where
															+    // long is 32 bits; "usedMem" reports the used field, not free.
															+    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, (unsigned long long)memInfo.total);
															+    LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, (unsigned long long)memInfo.used);
														
 
															     resp->total += memInfo.total;
														
 
															     resp->free += memInfo.free;
														
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -15,14 +15,26 @@ typedef struct nvmlMemory_st {
 
															   unsigned long long used;
														
 
															 } nvmlMemory_t;
														
 
															+// Local declaration of the brand enum used as the out-parameter type of
															+// the nvmlDeviceGetBrand function pointer. Only the zero "unknown" value
															+// is declared here; the brand is logged as a plain integer.
															+// NOTE(review): presumably mirrors NVML's nvmlBrandType_t — confirm
															+// against nvml.h.
															+typedef enum nvmlBrandType_enum
															+{
															+    NVML_BRAND_UNKNOWN          = 0,
															+} nvmlBrandType_t;
														
 
															+
														
 
															+// cuda_handle holds the loaded library handle, the verbose-logging flag,
															+// and the function pointers that cuda_init resolves by symbol name.
															 typedef struct cuda_handle {
															   void *handle;
															+  uint16_t verbose;  // non-zero enables LOG() output; set by LoadCUDAMgmt in gpu.go
															   nvmlReturn_t (*initFn)(void);
															   nvmlReturn_t (*shutdownFn)(void);
															   nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
															   nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
															   nvmlReturn_t (*getCount)(unsigned int *);
															   nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
															+  // Informational queries below are used for diagnostic logging only;
															+  // their failures are logged and otherwise ignored.
															+  nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length);
															+  nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length);
															+  nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);
															+  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int  length);
															+  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int  length);
															+  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
															 } cuda_handle_t;
														
 
															 typedef struct cuda_init_resp {
														
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -4,7 +4,7 @@
 
															 #include <string.h>
														
 
															-#define ROCM_LOOKUP_SIZE 5
														
 
															+#define ROCM_LOOKUP_SIZE 14
														
 
															 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
														
 
															   rsmi_status_t ret;
														
@@ -21,7 +21,15 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
															       {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
														
 
															       {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
														
 
															       {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
														
 
															-      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
														
 
															+      {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
														
 
															+      {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
														
 
															+      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
														
 
															+      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
														
 
															+      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
														
 
															+      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
														
 
															+      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
														
 
															+      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
														
 
															+      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
														
 
															   };
														
 
															   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
														
@@ -62,8 +70,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
															 void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
														
 
															   resp->err = NULL;
														
 
															-  // uint32_t num_devices;
														
 
															-  // uint16_t device;
														
 
															   uint64_t totalMem = 0;
														
 
															   uint64_t usedMem = 0;
														
 
															   rsmi_status_t ret;
														
@@ -76,34 +82,82 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
 
															     return;
														
 
															   }
														
 
															-  // TODO - iterate through devices...  ret =
														
 
															-  // rsmi_num_monitor_devices(&num_devices);
														
 
															-
														
 
															-  // ret = (*h.getHandle)(0, &device);
														
 
															-  // if (ret != RSMI_STATUS_SUCCESS) {
														
 
															-  //     printf("rocm vram device lookup failure: %d\n", ret);
														
 
															-  //     return -1;
														
 
															-  // }
														
 
															-
														
 
															-  // Get total memory - used memory for available memory
														
 
															-  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
														
 
															-  if (ret != RSMI_STATUS_SUCCESS) {
														
 
															-    snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
														
 
															-    resp->err = strdup(buf);
														
 
															-    return;
														
 
															-  }
														
 
															-  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
														
 
															+  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
														
 
															   if (ret != RSMI_STATUS_SUCCESS) {
														
 
															-    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
														
 
															+    snprintf(buf, buflen, "unable to get device count: %d", ret);
														
 
															     resp->err = strdup(buf);
														
 
															     return;
														
 
															   }
														
 
															+  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
														
 
															+
														
 
															+  resp->total = 0;
														
 
															+  resp->free = 0;
														
 
															+  for (i = 0; i < resp->count; i++) {
														
 
															+    if (h.verbose) {
														
 
															+      // When in verbose mode, report more information about
														
 
															+      // the card we discover, but don't fail on error
														
 
															+      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
														
 
															+      if (ret != RSMI_STATUS_SUCCESS) {
														
 
															+        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
														
 
															+      } else {
														
 
															+        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
														
 
															+      }
														
 
															+      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
														
 
															+      if (ret != RSMI_STATUS_SUCCESS) {
														
 
															+        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
														
 
															+      } else {
														
 
															+        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
														
 
															+      }
														
 
															+      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
														
 
															+      if (ret != RSMI_STATUS_SUCCESS) {
														
 
															+        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
														
 
															+      } else {
														
 
															+        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
														
 
															+      }
														
 
															+      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
														
 
															+      if (ret != RSMI_STATUS_SUCCESS) {
														
 
															+        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
														
 
															+      } else {
														
 
															+        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
														
 
															+      }
														
 
															+      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
														
 
															+      if (ret != RSMI_STATUS_SUCCESS) {
														
 
															+        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
														
 
															+      } else {
														
 
															+        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
														
 
															+      }
														
 
															+      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
														
 
															+      if (ret != RSMI_STATUS_SUCCESS) {
														
 
															+        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
														
 
															+      } else {
														
 
															+        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
														
 
															+      }
														
 
															+      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
														
 
															+      if (ret != RSMI_STATUS_SUCCESS) {
														
 
															+        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
														
 
															+      } else {
														
 
															+        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
														
 
															+      }
														
 
															+    }
														
 
															-  // TODO: set this to the actual number of devices
														
 
															-  resp->count = 1;
														
 
															-  resp->total = totalMem;
														
 
															-  resp->free = totalMem - usedMem;
														
 
															-  return;
														
 
															+    // Get total memory - used memory for available memory
														
 
															+    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
														
 
															+    if (ret != RSMI_STATUS_SUCCESS) {
														
 
															+      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
														
 
															+      resp->err = strdup(buf);
														
 
															+      return;
														
 
															+    }
														
 
															+    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
														
 
															+    if (ret != RSMI_STATUS_SUCCESS) {
														
 
															+      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
														
 
															+      resp->err = strdup(buf);
														
 
															+      return;
														
 
															+    }
														
 
															+    // totalMem/usedMem are uint64_t; cast and print with %llu so the
															+    // output is correct on platforms where long is 32 bits.
															+    LOG(h.verbose, "[%d] ROCm totalMem %llu\n", i, (unsigned long long)totalMem);
															+    LOG(h.verbose, "[%d] ROCm usedMem %llu\n", i, (unsigned long long)usedMem);
														
 
															+    resp->total += totalMem;
														
 
															+    resp->free += totalMem - usedMem;
														
 
															+  }
														
 
															 }
														
 
															 void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
														
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -24,12 +24,21 @@ typedef enum rsmi_memory_type {
 
															+// rocm_handle holds the loaded library handle, the verbose-logging flag,
															+// and the function pointers that rocm_init resolves by symbol name.
															 typedef struct rocm_handle {
															   void *handle;
															+  uint16_t verbose;  // non-zero enables LOG() output; set by LoadROCMMgmt in gpu.go
															   rsmi_status_t (*initFn)(uint64_t);
															   rsmi_status_t (*shutdownFn)(void);
															   rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
															   rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
															   rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
															-  // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
															+  // Device enumeration: rsmi_num_monitor_devices supplies the count that
															+  // rocm_check_vram iterates over.
															+  rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
															+  rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
															+  // Per-device string queries (device index, output buffer, buffer length),
															+  // called from the verbose logging path in rocm_check_vram.
															+  // NOTE(review): rsmi_dev_name_get declares its length as size_t while the
															+  // others use uint32_t — confirm each prototype against rocm_smi.h.
															+  rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
															+  rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);		
															+  rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);		
															+  rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);		
															+  rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);		
															+  rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);		
															+  rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);		
															 } rocm_handle_t;
														
 
															 typedef struct rocm_init_resp {