@@ -16,6 +16,7 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
@@ -28,6 +29,7 @@ type handles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
 	nvcuda      *C.nvcuda_handle_t
+	oneapi      *C.oneapi_handle_t
 }
 
 const (
@@ -80,6 +82,15 @@ var NvcudaWindowsGlobs = []string{
 	"c:\\windows\\system*\\nvcuda.dll",
 }
 
+var OneapiWindowsGlobs = []string{
+	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
+}
+
+var OneapiLinuxGlobs = []string{
+	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
+	"/usr/lib*/libze_intel_gpu.so*",
+}
+
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
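Note: both glob sets target the Intel compute runtime's Level Zero driver library itself (on Linux typically a versioned libze_intel_gpu.so.1, hence the trailing `*`), not the generic ze_loader dispatcher, so a match implies a working Intel GPU driver rather than merely an installed oneAPI runtime.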
@@ -94,6 +105,8 @@ func initGPUHandles() *handles {
 	var cudartMgmtPatterns []string
 	var nvcudaMgmtName string
 	var nvcudaMgmtPatterns []string
+	var oneapiMgmtName string
+	var oneapiMgmtPatterns []string
 
 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
@@ -105,6 +118,8 @@ func initGPUHandles() *handles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "nvcuda.dll"
 		nvcudaMgmtPatterns = NvcudaWindowsGlobs
+		oneapiMgmtName = "ze_intel_gpu64.dll"
+		oneapiMgmtPatterns = OneapiWindowsGlobs
 	case "linux":
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
@@ -115,6 +130,8 @@ func initGPUHandles() *handles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "libcuda.so*"
 		nvcudaMgmtPatterns = NvcudaLinuxGlobs
+		oneapiMgmtName = "libze_intel_gpu.so"
+		oneapiMgmtPatterns = OneapiLinuxGlobs
 	default:
 		return gpuHandles
 	}
@@ -141,6 +158,18 @@ func initGPUHandles() *handles {
 			return gpuHandles
 		}
 	}
+
+	oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
+	if len(oneapiLibPaths) > 0 {
+		deviceCount, oneapi, libPath := LoadOneapiMgmt(oneapiLibPaths)
+		if oneapi != nil {
+			slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount)
+			gpuHandles.oneapi = oneapi
+			gpuHandles.deviceCount = deviceCount
+			return gpuHandles
+		}
+	}
+
 	return gpuHandles
 }
 
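Note: each detection branch in initGPUHandles returns early on success, so the oneAPI probe added here only runs when neither cudart nor nvcuda could be loaded; on a machine with both NVIDIA and Intel GPUs, the NVIDIA path wins.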
@@ -181,39 +210,53 @@ func GetGPUInfo() GpuInfoList {
 		if cpuVariant == "" && runtime.GOARCH == "amd64" {
 			continue
 		}
-		gpuInfo := GpuInfo{
-			Library: "cuda",
-		}
-		var driverMajor int
-		var driverMinor int
-		if gpuHandles.cudart != nil {
-			C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
-		} else {
-			C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
-			driverMajor = int(gpuHandles.nvcuda.driver_major)
-			driverMinor = int(gpuHandles.nvcuda.driver_minor)
-		}
-		if memInfo.err != nil {
-			slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			continue
+		if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
+			gpuInfo := GpuInfo{
+				Library: "cuda",
+			}
+			var driverMajor int
+			var driverMinor int
+			if gpuHandles.cudart != nil {
+				C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
+			} else {
+				C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+				driverMajor = int(gpuHandles.nvcuda.driver_major)
+				driverMinor = int(gpuHandles.nvcuda.driver_minor)
+			}
+			if memInfo.err != nil {
+				slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+				C.free(unsafe.Pointer(memInfo.err))
+				continue
+			}
+			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+				continue
+			}
+			gpuInfo.TotalMemory = uint64(memInfo.total)
+			gpuInfo.FreeMemory = uint64(memInfo.free)
+			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+			gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+			gpuInfo.MinimumMemory = cudaMinimumMemory
+			gpuInfo.DependencyPath = depPath
+			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+			gpuInfo.DriverMajor = int(driverMajor)
+			gpuInfo.DriverMinor = int(driverMinor)
+
+			// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
+			resp = append(resp, gpuInfo)
 		}
-		if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-			slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-			continue
+		if gpuHandles.oneapi != nil {
+			gpuInfo := GpuInfo{
+				Library: "oneapi",
+			}
+			C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
+			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+			memInfo.free = C.uint64_t(totalFreeMem)
+			gpuInfo.TotalMemory = uint64(memInfo.total)
+			gpuInfo.FreeMemory = uint64(memInfo.free)
+			gpuInfo.ID = strconv.Itoa(i)
+			resp = append(resp, gpuInfo)
 		}
-		gpuInfo.TotalMemory = uint64(memInfo.total)
-		gpuInfo.FreeMemory = uint64(memInfo.free)
-		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-		gpuInfo.MinimumMemory = cudaMinimumMemory
-		gpuInfo.DependencyPath = depPath
-		gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-		gpuInfo.DriverMajor = int(driverMajor)
-		gpuInfo.DriverMinor = int(driverMinor)
-
-		// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-		resp = append(resp, gpuInfo)
 	}
 
 	// Then AMD
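Two things worth noting in the oneAPI branch above: oneapi_check_vram is not passed the device index i, so with more than one Intel GPU every iteration reports the same memory figures and only the strconv.Itoa(i) ID distinguishes the entries; and the 0.95 multiplier deliberately under-reports free VRAM (a device with 16384 MiB free is advertised as roughly 15565 MiB) to leave head-room for the oneMKL allocations the ggml-sycl backend makes, per the inline comment. The factor is a heuristic, not a measured reservation.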
@@ -348,6 +391,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	return 0, nil, ""
 }
 
+func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
+	var resp C.oneapi_init_resp_t
+	resp.oh.verbose = getVerboseState()
+	for _, libPath := range oneapiLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.oneapi_init(lib, &resp)
+		if resp.err != nil {
+			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return int(resp.num_devices), &resp.oh, libPath
+		}
+	}
+	return 0, nil, ""
+}
+
 func getVerboseState() C.uint16_t {
 	if envconfig.Debug {
 		return C.uint16_t(1)
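LoadOneapiMgmt mirrors LoadNVCUDAMgmt above, including the defer C.free inside the loop: every CString stays allocated until the function returns, which is acceptable for the handful of candidate paths these globs can produce.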
@@ -368,6 +428,8 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return cudaGetVisibleDevicesEnv(l)
 	case "rocm":
 		return rocmGetVisibleDevicesEnv(l)
+	case "oneapi":
+		return oneapiGetVisibleDevicesEnv(l)
 	default:
 		slog.Debug("no filter required for library " + l[0].Library)
 		return "", ""
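The new case dispatches to oneapiGetVisibleDevicesEnv, which is defined outside this diff. A minimal sketch of such a helper, assuming it mirrors the cudaGetVisibleDevicesEnv/rocmGetVisibleDevicesEnv pattern and steers device selection through Level Zero's ONEAPI_DEVICE_SELECTOR environment variable; the body below is illustrative, not the PR's actual implementation:

func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
	ids := []string{}
	for _, info := range gpuInfo {
		if info.Library != "oneapi" {
			// GetVisibleDevicesEnv switches on l[0].Library, so the list
			// should already be a single-library group; skip defensively.
			slog.Debug("skipping non-oneapi device", "library", info.Library)
			continue
		}
		ids = append(ids, info.ID)
	}
	// Level Zero selector syntax, e.g. ONEAPI_DEVICE_SELECTOR=level_zero:0,1
	return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(ids, ",")
}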