Browse Source

darwin: no partial offloading if required memory is greater than system memory

Michael Yang 1 year ago
parent
commit
41a272de9f
4 changed files with 17 additions and 9 deletions
  1. gpu/gpu_darwin.go (+2 -1)
  2. gpu/gpu_info_darwin.h (+1 -0)
  3. gpu/gpu_info_darwin.m (+9 -7)
  4. llm/server.go (+5 -1)

+ 2 - 1
gpu/gpu_darwin.go

@@ -32,6 +32,7 @@ func CheckVRAM() (uint64, error) {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
+
 	return uint64(C.getRecommendedMaxVRAM()), nil
 }
 
@@ -52,7 +53,7 @@ func GetGPUInfo() GpuInfo {
 
 func getCPUMem() (memInfo, error) {
 	return memInfo{
-		TotalMemory: 0,
+		TotalMemory: uint64(C.getPhysicalMemory()),
 		FreeMemory:  0,
 		DeviceCount: 0,
 	}, nil

+ 1 - 0
gpu/gpu_info_darwin.h

@@ -1,3 +1,4 @@
 #import <Metal/Metal.h>
 #include <stdint.h>
 uint64_t getRecommendedMaxVRAM();
+uint64_t getPhysicalMemory();

+ 9 - 7
gpu/gpu_info_darwin.m

@@ -1,11 +1,13 @@
-//go:build darwin
+// go:build darwin
 #include "gpu_info_darwin.h"
 
-uint64_t getRecommendedMaxVRAM()
-{
-	id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-	uint64_t result = device.recommendedMaxWorkingSetSize;
-	CFRelease(device);
-	return result;
+uint64_t getRecommendedMaxVRAM() {
+  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+  uint64_t result = device.recommendedMaxWorkingSetSize;
+  CFRelease(device);
+  return result;
 }
 
+uint64_t getPhysicalMemory() {
+  return [[NSProcessInfo processInfo] physicalMemory];
+}

+ 5 - 1
llm/server.go

@@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 
 	memoryLayerOutput := layers["output"].size()
 	memoryRequiredTotal += memoryLayerOutput
-	if memoryAvailable > memoryRequiredTotal {
+
+	if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
+		// disable partial offloading when model is greater than total system memory
+		opts.NumGPU = 0
+	} else if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}