1 năm trước cách đây · f63dc2db5c
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -259,6 +259,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 
				 
			
 
				 func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
			
 
				 	var resp C.cuda_init_resp_t
			
 
				+	resp.ch.verbose = getVerboseState()
			
 
				 	for _, libPath := range cudaLibPaths {
			
 
				 		lib := C.CString(libPath)
			
 
				 		defer C.free(unsafe.Pointer(lib))
			
@@ -275,6 +276,7 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 
				 
			
 
				 func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
			
 
				 	var resp C.rocm_init_resp_t
			
 
				+	resp.rh.verbose = getVerboseState()
			
 
				 	for _, libPath := range rocmLibPaths {
			
 
				 		lib := C.CString(libPath)
			
 
				 		defer C.free(unsafe.Pointer(lib))
			
@@ -288,3 +290,10 @@ func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 
				 	}
			
 
				 	return nil
			
 
				 }
			
 
				+
			
 
				+func getVerboseState() C.uint16_t {
			
 
				+	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
			
 
				+		return C.uint16_t(1)
			
 
				+	}
			
 
				+	return C.uint16_t(0)
			
 
				+}
			
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -27,6 +27,13 @@
 
				 
			
 
				 #endif
			
 
				 
			
 
				+// LOG prints a printf-style message to stderr, but only when `verbose`
				+// evaluates to non-zero. The do/while(0) wrapper lets the macro be used
				+// as a single statement followed by a semicolon.
				+#define LOG(verbose, ...) \
				+  do { \
				+    if (verbose) \
				+      fprintf(stderr, __VA_ARGS__); \
				+  } while (0)
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 extern "C" {
			
 
				 #endif
			
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -4,7 +4,7 @@
 
				 
			
 
				 #include <string.h>
			
 
				 
			
 
				-#define CUDA_LOOKUP_SIZE 6
			
 
				+#define CUDA_LOOKUP_SIZE 12
			
 
				 
			
 
				 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
			
 
				   nvmlReturn_t ret;
			
@@ -23,6 +23,12 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 
				       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
			
 
				       {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
			
 
				       {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
			
 
				+      {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
			
 
				+      {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
			
 
				+      {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
			
 
				+      {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
			
 
				+      {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
			
 
				+      {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
			
 
				   };
			
 
				 
			
 
				   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
			
@@ -58,7 +64,13 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 
				     resp->err = strdup(buf);
			
 
				   }
			
 
				 
			
 
				-  return;
			
 
				+  // Report driver version if we're in verbose mode, ignore errors
			
 
				+  ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
			
 
				+  if (ret != NVML_SUCCESS) {
			
 
				+    LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
			
 
				+  } else {
			
 
				+    LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
			
 
				+  }
			
 
				 }
			
 
				 
			
 
				 void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
			
@@ -98,6 +110,44 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
 
				       resp->err = strdup(buf);
			
 
				       return;
			
 
				     }
			
 
				+    if (h.verbose) {
			
 
				+      nvmlBrandType_t brand = 0;
			
 
				+      // When in verbose mode, report more information about
			
 
				+      // the card we discover, but don't fail on error
			
 
				+      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
			
 
				+    LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
			
 
				 
			
 
				     resp->total += memInfo.total;
			
 
				     resp->free += memInfo.free;
			
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -15,14 +15,26 @@ typedef struct nvmlMemory_st {
 
				   unsigned long long used;
			
 
				 } nvmlMemory_t;
			
 
				 
			
 
				+// Value type written by the nvmlDeviceGetBrand function pointer. Only the
				+// UNKNOWN value (0) is declared here.
				+typedef enum nvmlBrandType_enum {
				+  NVML_BRAND_UNKNOWN = 0,
				+} nvmlBrandType_t;
			
 
				+
			
 
				 typedef struct cuda_handle {
			
 
				   void *handle;
			
 
				+  uint16_t verbose;
			
 
				   nvmlReturn_t (*initFn)(void);
			
 
				   nvmlReturn_t (*shutdownFn)(void);
			
 
				   nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
			
 
				   nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
			
 
				   nvmlReturn_t (*getCount)(unsigned int *);
			
 
				   nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
			
 
				+  nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length);
			
 
				+  nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length);
			
 
				+  nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);
			
 
				+  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int  length);
			
 
				+  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int  length);
			
 
				+  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
			
 
				 } cuda_handle_t;
			
 
				 
			
 
				 typedef struct cuda_init_resp {
			
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -4,7 +4,7 @@
 
				 
			
 
				 #include <string.h>
			
 
				 
			
 
				-#define ROCM_LOOKUP_SIZE 5
			
 
				+#define ROCM_LOOKUP_SIZE 14
			
 
				 
			
 
				 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
			
 
				   rsmi_status_t ret;
			
@@ -21,7 +21,15 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
				       {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
			
 
				       {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
			
 
				       {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
			
 
				-      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
			
 
				+      {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
			
 
				+      {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
			
 
				+      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
			
 
				+      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
			
 
				+      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
			
 
				+      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
			
 
				+      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
			
 
				+      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
			
 
				+      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
			
 
				   };
			
 
				 
			
 
				   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
			
@@ -62,8 +70,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
				 
			
 
				 void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
			
 
				   resp->err = NULL;
			
 
				-  // uint32_t num_devices;
			
 
				-  // uint16_t device;
			
 
				   uint64_t totalMem = 0;
			
 
				   uint64_t usedMem = 0;
			
 
				   rsmi_status_t ret;
			
@@ -76,34 +82,82 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  // TODO - iterate through devices...  ret =
			
 
				-  // rsmi_num_monitor_devices(&num_devices);
			
 
				-
			
 
				-  // ret = (*h.getHandle)(0, &device);
			
 
				-  // if (ret != RSMI_STATUS_SUCCESS) {
			
 
				-  //     printf("rocm vram device lookup failure: %d\n", ret);
			
 
				-  //     return -1;
			
 
				-  // }
			
 
				-
			
 
				-  // Get total memory - used memory for available memory
			
 
				-  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
			
 
				-  if (ret != RSMI_STATUS_SUCCESS) {
			
 
				-    snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
			
 
				-    resp->err = strdup(buf);
			
 
				-    return;
			
 
				-  }
			
 
				-  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
			
 
				+  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
			
 
				   if (ret != RSMI_STATUS_SUCCESS) {
			
 
				-    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
			
 
				+    snprintf(buf, buflen, "unable to get device count: %d", ret);
			
 
				     resp->err = strdup(buf);
			
 
				     return;
			
 
				   }
			
 
				+  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
			
 
				+
			
 
				+  resp->total = 0;
			
 
				+  resp->free = 0;
			
 
				+  for (i = 0; i < resp->count; i++) {
			
 
				+    if (h.verbose) {
			
 
				+      // When in verbose mode, report more information about
			
 
				+      // the card we discover, but don't fail on error
			
 
				+      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
			
 
				+      }
			
 
				+      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
			
 
				+      if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
			
 
				+      } else {
			
 
				+        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
			
 
				+      }
			
 
				+    }
			
 
				 
			
 
				-  // TODO: set this to the actual number of devices
			
 
				-  resp->count = 1;
			
 
				-  resp->total = totalMem;
			
 
				-  resp->free = totalMem - usedMem;
			
 
				-  return;
			
 
				+    // Get total memory - used memory for available memory
			
 
				+    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
			
 
				+    if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
			
 
				+      resp->err = strdup(buf);
			
 
				+      return;
			
 
				+    }
			
 
				+    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
			
 
				+    if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
			
 
				+      resp->err = strdup(buf);
			
 
				+      return;
			
 
				+    }
			
 
				+    LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
			
 
				+    LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
			
 
				+    resp->total += totalMem;
			
 
				+    resp->free += totalMem - usedMem;
			
 
				+  }
			
 
				 }
			
 
				 
			
 
				 void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
			
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -24,12 +24,21 @@ typedef enum rsmi_memory_type {
 
				 
			
 
				 typedef struct rocm_handle {
			
 
				   void *handle;
			
 
				+  uint16_t verbose;
			
 
				   rsmi_status_t (*initFn)(uint64_t);
			
 
				   rsmi_status_t (*shutdownFn)(void);
			
 
				   rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
			
 
				   rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
			
 
				   rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
			
 
				-  // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
			
 
				+  rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
			
 
				+  rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
			
 
				+  rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
			
 
				+  rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);		
			
 
				+  rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);		
			
 
				+  rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);		
			
 
				+  rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);		
			
 
				+  rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);		
			
 
				+  rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);		
			
 
				 } rocm_handle_t;
			
 
				 
			
 
				 typedef struct rocm_init_resp {