|
@@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
|
|
|
|
|
memoryLayerOutput := layers["output"].size()
|
|
|
memoryRequiredTotal += memoryLayerOutput
|
|
|
- if memoryAvailable > memoryRequiredTotal {
|
|
|
+
|
|
|
+ if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
|
|
|
+ // disable partial offloading when the model is larger than total system memory
|
|
|
+ opts.NumGPU = 0
|
|
|
+ } else if memoryAvailable > memoryRequiredTotal {
|
|
|
layerCount = int(ggml.KV().BlockCount()) + 1
|
|
|
memoryRequiredPartial = memoryRequiredTotal
|
|
|
}
|