
add support for libcudart.so for CUDA devices (adds Jetson support)

Jeremy 1 year ago
parent
commit
dfc6721b20
8 changed files with 438 additions and 83 deletions
  1. gpu/gpu.go  +123 -32
  2. gpu/gpu_info.h  +2 -1
  3. gpu/gpu_info_cudart.c  +190 -0
  4. gpu/gpu_info_cudart.h  +59 -0
  5. gpu/gpu_info_nvml.c  +12 -12
  6. gpu/gpu_info_nvml.h  +13 -13
  7. llm/generate/gen_common.sh  +1 -1
  8. llm/generate/gen_linux.sh  +38 -24

+ 123 - 32
gpu/gpu.go

@@ -23,7 +23,8 @@ import (
 )
 
 type handles struct {
-	cuda *C.cuda_handle_t
+	nvml   *C.nvml_handle_t
+	cudart *C.cudart_handle_t
 }
 
 var gpuMutex sync.Mutex
@@ -33,7 +34,7 @@ var gpuHandles *handles = nil
 var CudaComputeMin = [2]C.int{5, 0}
 
 // Possible locations for the nvidia-ml library
-var CudaLinuxGlobs = []string{
+var NvmlLinuxGlobs = []string{
 	"/usr/local/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
@@ -41,49 +42,98 @@ var CudaLinuxGlobs = []string{
 	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
 	"/opt/cuda/lib64/libnvidia-ml.so*",
 	"/usr/lib*/libnvidia-ml.so*",
-	"/usr/local/lib*/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
+	"/usr/local/lib*/libnvidia-ml.so*",
 
 	// TODO: are these stubs ever valid?
 	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 }
 
-var CudaWindowsGlobs = []string{
+var NvmlWindowsGlobs = []string{
 	"c:\\Windows\\System32\\nvml.dll",
 }
 
+var CudartLinuxGlobs = []string{
+	"/usr/local/cuda/lib64/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
+	"/usr/lib/wsl/lib/libcudart.so*",
+	"/usr/lib/wsl/drivers/*/libcudart.so*",
+	"/opt/cuda/lib64/libcudart.so*",
+	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
+	"/usr/local/cuda/lib*/libcudart.so*",
+	"/usr/lib*/libcudart.so*",
+	"/usr/local/lib*/libcudart.so*",
+}
+
+var CudartWindowsGlobs = []string{
+	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
+}
+
+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
 // Note: gpuMutex must already be held
 func initGPUHandles() {
 
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
 
-	gpuHandles = &handles{nil}
-	var cudaMgmtName string
-	var cudaMgmtPatterns []string
+	gpuHandles = &handles{nil, nil}
+	var nvmlMgmtName string
+	var nvmlMgmtPatterns []string
+	var cudartMgmtName string
+	var cudartMgmtPatterns []string
+
+	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
 	case "windows":
-		cudaMgmtName = "nvml.dll"
-		cudaMgmtPatterns = make([]string, len(CudaWindowsGlobs))
-		copy(cudaMgmtPatterns, CudaWindowsGlobs)
+		nvmlMgmtName = "nvml.dll"
+		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
+		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
+		cudartMgmtName = "cudart64_*.dll"
+		localAppData := os.Getenv("LOCALAPPDATA")
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
 	case "linux":
-		cudaMgmtName = "libnvidia-ml.so"
-		cudaMgmtPatterns = make([]string, len(CudaLinuxGlobs))
-		copy(cudaMgmtPatterns, CudaLinuxGlobs)
+		nvmlMgmtName = "libnvidia-ml.so"
+		nvmlMgmtPatterns = make([]string, len(NvmlLinuxGlobs))
+		copy(nvmlMgmtPatterns, NvmlLinuxGlobs)
+		cudartMgmtName = "libcudart.so*"
+		if tmpDir != "" {
+			// TODO - add "payloads" for subprocess
+			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
+		}
+		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
 	default:
 		return
 	}
 
 	slog.Info("Detecting GPU type")
-	cudaLibPaths := FindGPULibs(cudaMgmtName, cudaMgmtPatterns)
-	if len(cudaLibPaths) > 0 {
-		cuda := LoadCUDAMgmt(cudaLibPaths)
-		if cuda != nil {
-			slog.Info("Nvidia GPU detected")
-			gpuHandles.cuda = cuda
+	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	if len(cudartLibPaths) > 0 {
+		cudart := LoadCUDARTMgmt(cudartLibPaths)
+		if cudart != nil {
+			slog.Info("Nvidia GPU detected via cudart")
+			gpuHandles.cudart = cudart
 			return
 		}
 	}
+
+	// TODO once we build confidence, remove this and the gpu_info_nvml.[ch] files
+	nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+	if len(nvmlLibPaths) > 0 {
+		nvml := LoadNVMLMgmt(nvmlLibPaths)
+		if nvml != nil {
+			slog.Info("Nvidia GPU detected via nvidia-ml")
+			gpuHandles.nvml = nvml
+			return
+		}
+	}
+
 }
 
 func GetGPUInfo() GpuInfo {
@@ -103,23 +153,42 @@ func GetGPUInfo() GpuInfo {
 
 	var memInfo C.mem_info_t
 	resp := GpuInfo{}
-	if gpuHandles.cuda != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
-		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+	if gpuHandles.nvml != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.nvml_check_vram(*gpuHandles.nvml, &memInfo)
+		if memInfo.err != nil {
+			slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU memory: %s", C.GoString(memInfo.err)))
+			C.free(unsafe.Pointer(memInfo.err))
+		} else if memInfo.count > 0 {
+			// Verify minimum compute capability
+			var cc C.nvml_compute_capability_t
+			C.nvml_compute_capability(*gpuHandles.nvml, &cc)
+			if cc.err != nil {
+				slog.Info(fmt.Sprintf("[nvidia-ml] error looking up NVML GPU compute capability: %s", C.GoString(cc.err)))
+				C.free(unsafe.Pointer(cc.err))
+			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				resp.Library = "cuda"
+			} else {
+				slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+			}
+		}
+	} else if gpuHandles.cudart != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		C.cudart_check_vram(*gpuHandles.cudart, &memInfo)
 		if memInfo.err != nil {
-			slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err)))
+			slog.Info(fmt.Sprintf("[cudart] error looking up CUDART GPU memory: %s", C.GoString(memInfo.err)))
 			C.free(unsafe.Pointer(memInfo.err))
 		} else if memInfo.count > 0 {
 			// Verify minimum compute capability
-			var cc C.cuda_compute_capability_t
-			C.cuda_compute_capability(*gpuHandles.cuda, &cc)
+			var cc C.cudart_compute_capability_t
+			C.cudart_compute_capability(*gpuHandles.cudart, &cc)
 			if cc.err != nil {
-				slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err)))
+				slog.Info(fmt.Sprintf("[cudart] error looking up CUDA compute capability: %s", C.GoString(cc.err)))
 				C.free(unsafe.Pointer(cc.err))
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
-				slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
 			} else {
-				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
+				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
 	} else {
@@ -176,6 +245,11 @@ func CheckVRAM() (int64, error) {
 		if overhead < gpus*1024*1024*1024 {
 			overhead = gpus * 1024 * 1024 * 1024
 		}
+		// Tegra devices share memory with the OS, which handles caching, so use
+		// the full reported free memory instead of reserving extra overhead.
+		if CudaTegra != "" {
+			overhead = 0
+		}
 		avail := int64(gpuInfo.FreeMemory - overhead)
 		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
 		return avail, nil
@@ -238,15 +312,32 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 	return gpuLibPaths
 }
 
-func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
-	var resp C.cuda_init_resp_t
+func LoadNVMLMgmt(nvmlLibPaths []string) *C.nvml_handle_t {
+	var resp C.nvml_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvmlLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.nvml_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch
+		}
+	}
+	return nil
+}
+
+func LoadCUDARTMgmt(cudartLibPaths []string) *C.cudart_handle_t {
+	var resp C.cudart_init_resp_t
 	resp.ch.verbose = getVerboseState()
-	for _, libPath := range cudaLibPaths {
+	for _, libPath := range cudartLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
-		C.cuda_init(lib, &resp)
+		C.cudart_init(lib, &resp)
 		if resp.err != nil {
-			slog.Info(fmt.Sprintf("Unable to load CUDA management library %s: %s", libPath, C.GoString(resp.err)))
+			slog.Info(fmt.Sprintf("Unable to load cudart CUDA management library %s: %s", libPath, C.GoString(resp.err)))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
 			return &resp.ch

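To make the gating above concrete: GetGPUInfo only reports the "cuda" library when the detected compute capability meets CudaComputeMin = {5, 0}. The following standalone sketch is an illustration only, not part of this commit (written in C for brevity even though the check itself lives in gpu.go), and shows the same comparison against a couple of representative devices:

// Sketch of the minimum compute-capability gate applied in GetGPUInfo
// (CudaComputeMin = {5, 0}); illustration only, not part of this commit.
#include <stdbool.h>
#include <stdio.h>

static const int min_major = 5, min_minor = 0;

static bool meets_min_compute(int major, int minor) {
  return major > min_major || (major == min_major && minor >= min_minor);
}

int main(void) {
  // A Jetson Nano reports compute capability 5.3 and passes the gate,
  // while an older Kepler card reporting 3.5 falls back to CPU mode.
  printf("5.3 -> %s\n", meets_min_compute(5, 3) ? "cuda" : "cpu");
  printf("3.5 -> %s\n", meets_min_compute(3, 5) ? "cuda" : "cpu");
  return 0;
}
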
+ 2 - 1
gpu/gpu_info.h

@@ -52,7 +52,8 @@ void cpu_check_ram(mem_info_t *resp);
 }
 #endif
 
-#include "gpu_info_cuda.h"
+#include "gpu_info_nvml.h"
+#include "gpu_info_cudart.h"
 
 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__

+ 190 - 0
gpu/gpu_info_cudart.c

@@ -0,0 +1,190 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+#include "gpu_info_cudart.h"
+
+void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
+  cudartReturn_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
+      {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
+      {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
+      {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
+      {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
+      {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
+      {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
+    snprintf(buf, buflen,
+            "Unable to load %s library to query for Nvidia GPUs: %s",
+            cudart_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring cudart library functions in %s\n", cudart_lib_path);
+  
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!l[i].p) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+              msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.cudaSetDevice)(0);
+  if (ret != CUDART_SUCCESS) {
+    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "cudart init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  int version = 0;
+  cudartDriverVersion_t driverVersion;
+  driverVersion.major = 0;
+  driverVersion.minor = 0;
+
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.cudaDriverGetVersion)(&version);
+  if (ret != CUDART_SUCCESS) {
+    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
+  } else {
+    driverVersion.major = version / 1000;
+    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
+  }
+}
+
+
+void cudart_check_vram(cudart_handle_t h, mem_info_t *resp) {
+  resp->err = NULL;
+  cudartMemory_t memInfo = {0,0,0};
+  cudartReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("cudart handle isn't initialized");
+    return;
+  }
+
+  // cudaGetDeviceCount takes int type, resp->count is uint
+  int deviceCount;
+  ret = (*h.cudaGetDeviceCount)(&deviceCount);
+  if (ret != CUDART_SUCCESS) {
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  } else {
+    resp->count = (unsigned int)deviceCount;
+  }
+
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    ret = (*h.cudaSetDevice)(i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device failed to initialize");
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    LOG(h.verbose, "[%d] CUDA totalMem %lu\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %lu\n", i, memInfo.free);
+
+    resp->total += memInfo.total;
+    resp->free += memInfo.free;
+  }
+}
+
+void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *resp) {
+  resp->err = NULL;
+  resp->major = 0;
+  resp->minor = 0;
+  int major = 0;
+  int minor = 0;
+  cudartReturn_t ret;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  if (h.handle == NULL) {
+    resp->err = strdup("cudart handle not initialized");
+    return;
+  }
+
+  int devices;
+  ret = (*h.cudaGetDeviceCount)(&devices);
+  if (ret != CUDART_SUCCESS) {
+    snprintf(buf, buflen, "unable to get cudart device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; i < devices; i++) {
+    ret = (*h.cudaSetDevice)(i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "cudart device failed to initialize");
+      resp->err = strdup(buf);
+      return;
+    }
+
+    ret = (*h.cudaDeviceGetAttribute)(&major, cudartDevAttrComputeCapabilityMajor, i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.cudaDeviceGetAttribute)(&minor, cudartDevAttrComputeCapabilityMinor, i);
+    if (ret != CUDART_SUCCESS) {
+      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+      
+    // Report the lowest major.minor we detect as that limits our compatibility
+    if (resp->major == 0 || resp->major > major ) {
+      resp->major = major;
+      resp->minor = minor;
+    } else if ( resp->major == major && resp->minor > minor ) {
+      resp->minor = minor;
+    }
+  }
+}
+
+#endif  // __APPLE__

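The new gpu_info_cudart.c resolves every cudart entry point at runtime through the LOAD_LIBRARY/LOAD_SYMBOL macros, so Ollama carries no link-time dependency on libcudart. Below is a minimal standalone sketch of the same technique, not part of this commit, assuming those macros wrap dlopen/dlsym on Linux and that some libcudart.so is locatable (build with: cc sketch.c -ldl):

// Runtime-load libcudart and query the CUDA version, mirroring the approach
// of cudart_init(); the hard-coded library name here is an assumption -- the
// real code picks the path from the glob search in gpu.go.
#include <dlfcn.h>
#include <stdio.h>

typedef int (*cudaDriverGetVersion_t)(int *driverVersion);

int main(void) {
  void *handle = dlopen("libcudart.so", RTLD_LAZY);
  if (!handle) {
    fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  cudaDriverGetVersion_t getVersion =
      (cudaDriverGetVersion_t)dlsym(handle, "cudaDriverGetVersion");
  if (!getVersion) {
    fprintf(stderr, "dlsym failed: %s\n", dlerror());
    dlclose(handle);
    return 1;
  }
  int version = 0;
  if (getVersion(&version) == 0) {  // 0 == cudaSuccess
    // CUDA encodes versions as 1000*major + 10*minor, e.g. 11040 -> 11.4,
    // which is the same decoding cudart_init() performs above.
    printf("CUDA version: %d.%d\n", version / 1000, (version % 1000) / 10);
  }
  dlclose(handle);
  return 0;
}

On a Jetson, the matching library would typically be found by one of the CudartLinuxGlobs patterns, e.g. /usr/local/cuda/lib64/libcudart.so*.
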
+ 59 - 0
gpu/gpu_info_cudart.h

@@ -0,0 +1,59 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_CUDART_H__
+#define __GPU_INFO_CUDART_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum cudartReturn_enum {
+  CUDART_SUCCESS = 0,
+  CUDART_UNSUPPORTED = 1,
+  // Other values omitted for now...
+} cudartReturn_t;
+
+typedef enum cudartDeviceAttr_enum {
+  cudartDevAttrComputeCapabilityMajor = 75,
+  cudartDevAttrComputeCapabilityMinor = 76,
+} cudartDeviceAttr_t;
+
+typedef void *cudartDevice_t;  // Opaque is sufficient
+typedef struct cudartMemory_st {
+  size_t total;
+  size_t free;
+  size_t used;
+} cudartMemory_t;
+
+typedef struct cudartDriverVersion {
+  int major;
+  int minor;
+} cudartDriverVersion_t;
+
+typedef struct cudart_handle {
+  void *handle;
+  uint16_t verbose;
+  cudartReturn_t (*cudaSetDevice)(int device);
+  cudartReturn_t (*cudaDeviceSynchronize)(void);
+  cudartReturn_t (*cudaDeviceReset)(void);
+  cudartReturn_t (*cudaMemGetInfo)(size_t *, size_t *);
+  cudartReturn_t (*cudaGetDeviceCount)(int *);
+  cudartReturn_t (*cudaDeviceGetAttribute)(int* value, cudartDeviceAttr_t attr, int device);
+  cudartReturn_t (*cudaDriverGetVersion) (int *driverVersion);
+} cudart_handle_t;
+
+typedef struct cudart_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  cudart_handle_t ch;
+} cudart_init_resp_t;
+
+typedef struct cudart_compute_capability {
+  char *err;
+  int major;
+  int minor;
+} cudart_compute_capability_t;
+
+
+void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
+void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
+void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
+
+#endif  // __GPU_INFO_CUDART_H__
+#endif  // __APPLE__

+ 12 - 12
gpu/gpu_info_cuda.c → gpu/gpu_info_nvml.c

@@ -1,10 +1,10 @@
 #ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
 
-#include "gpu_info_cuda.h"
-
 #include <string.h>
 
-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
+#include "gpu_info_nvml.h"
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   nvmlReturn_t ret;
   resp->err = NULL;
   const int buflen = 256;
@@ -30,20 +30,20 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
       {NULL, NULL},
   };
 
-  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
+  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
   if (!resp->ch.handle) {
     char *msg = LOAD_ERR();
-    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
     snprintf(buf, buflen,
              "Unable to load %s library to query for Nvidia GPUs: %s",
-             cuda_lib_path, msg);
+             nvml_lib_path, msg);
     free(msg);
     resp->err = strdup(buf);
     return;
   }
 
   // TODO once we've squashed the remaining corner cases remove this log
-  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
   
   for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
@@ -82,7 +82,7 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   }
 }
 
-void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
+void nvml_check_vram(nvml_handle_t h, mem_info_t *resp) {
   resp->err = NULL;
   nvmlDevice_t device;
   nvmlMemory_t memInfo = {0};
@@ -92,7 +92,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
   int i;
 
   if (h.handle == NULL) {
-    resp->err = strdup("nvml handle sn't initialized");
+    resp->err = strdup("nvml handle isn't initialized");
     return;
   }
 
@@ -155,15 +155,15 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
       }
     }
 
-    LOG(h.verbose, "[%d] CUDA totalMem %llu\n", i, memInfo.total);
-    LOG(h.verbose, "[%d] CUDA usedMem %llu\n", i, memInfo.used);
+    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA freeMem %ld\n", i, memInfo.free);
 
     resp->total += memInfo.total;
     resp->free += memInfo.free;
   }
 }
 
-void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
+void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
   resp->err = NULL;
   resp->major = 0;
   resp->minor = 0;

+ 13 - 13
gpu/gpu_info_cuda.h → gpu/gpu_info_nvml.h

@@ -1,6 +1,6 @@
 #ifndef __APPLE__
-#ifndef __GPU_INFO_CUDA_H__
-#define __GPU_INFO_CUDA_H__
+#ifndef __GPU_INFO_NVML_H__
+#define __GPU_INFO_NVML_H__
 #include "gpu_info.h"
 
 // Just enough typedef's to dlopen/dlsym for memory information
@@ -20,7 +20,7 @@ typedef enum nvmlBrandType_enum
     NVML_BRAND_UNKNOWN          = 0,
 } nvmlBrandType_t;
 
-typedef struct cuda_handle {
+typedef struct nvml_handle {
   void *handle;
   uint16_t verbose;
   nvmlReturn_t (*nvmlInit_v2)(void);
@@ -35,22 +35,22 @@ typedef struct cuda_handle {
   nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int  length);
   nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int  length);
   nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
-} cuda_handle_t;
+} nvml_handle_t;
 
-typedef struct cuda_init_resp {
+typedef struct nvml_init_resp {
   char *err;  // If err is non-null handle is invalid
-  cuda_handle_t ch;
-} cuda_init_resp_t;
+  nvml_handle_t ch;
+} nvml_init_resp_t;
 
-typedef struct cuda_compute_capability {
+typedef struct nvml_compute_capability {
   char *err;
   int major;
   int minor;
-} cuda_compute_capability_t;
+} nvml_compute_capability_t;
 
-void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp);
-void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp);
-void cuda_compute_capability(cuda_handle_t ch, cuda_compute_capability_t *cc);
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
+void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
 
-#endif  // __GPU_INFO_CUDA_H__
+#endif  // __GPU_INFO_NVML_H__
 #endif  // __APPLE__

+ 1 - 1
llm/generate/gen_common.sh

@@ -39,7 +39,7 @@ init_vars() {
     *)
         ;;
     esac
-    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then 
+    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
     fi
 }

+ 38 - 24
llm/generate/gen_linux.sh

@@ -90,30 +90,35 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             compress_libs
         fi
 
-        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
+        if [ "${ARCH}" == "x86_64" ]; then
             #
-            # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
-            # Approximately 400% faster than LCD on same CPU
+            # ARM chips in M1/M2/M3-based Macs and NVIDIA Tegra devices do not currently support AVX extensions.
             #
-            init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
-            echo "Building AVX CPU"
-            build
-            compress_libs
-        fi
-
-        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
-            #
-            # ~2013 CPU Dynamic library
-            # Approximately 10% faster than AVX on same CPU
-            #
-            init_vars
-            CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
-            echo "Building AVX2 CPU"
-            build
-            compress_libs
+            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
+                #
+                # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
+                # Approximately 400% faster than LCD on same CPU
+                #
+                init_vars
+                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+                BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
+                echo "Building AVX CPU"
+                build
+                compress_libs
+            fi
+
+            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
+                #
+                # ~2013 CPU Dynamic library
+                # Approximately 10% faster than AVX on same CPU
+                #
+                init_vars
+                CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+                BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
+                echo "Building AVX2 CPU"
+                build
+                compress_libs
+            fi
         fi
     fi
 else
@@ -142,12 +147,21 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
     if [ -n "${CUDA_MAJOR}" ]; then
         CUDA_VARIANT=_v${CUDA_MAJOR}
     fi
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    if [ "${ARCH}" == "arm64" ]; then
+        echo "ARM CPU detected - disabling unsupported AVX instructions"
+        
+        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
+        #
+        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
+        # Disabling it has minimal performance effect while maintaining compatibility.
+        ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
+    fi
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
     BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
     EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     build
 
-    # Cary the CUDA libs as payloads to help reduce dependency burden on users
+    # Carry the CUDA libs as payloads to help reduce dependency burden on users
     #
     # TODO - in the future we may shift to packaging these separately and conditionally
     #        downloading them in the install script.