
Increase minimum CUDA memory allocation overhead and fix minimum overhead for multi-gpu (#1896)

* increase minimum cuda overhead and fix minimum overhead for multi-gpu

* fix multi gpu overhead

* limit overhead to 10% of all gpus

* better wording

* allocate fixed amount before layers

* fixed only includes graph alloc
Jeffrey Morgan committed 1 year ago
Commit: b24e8d17b2
2 changed files with 19 additions and 12 deletions
  1. gpu/gpu.go (+5 −4)
  2. llm/llm.go (+14 −8)

gpu/gpu.go (+5 −4)

@@ -184,10 +184,11 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 384Mi of VRAM free for unaccounted for overhead
-		overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10
-		if overhead < 384*1024*1024 {
-			overhead = 384 * 1024 * 1024
+		// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted for overhead
+		overhead := gpuInfo.FreeMemory / 10
+		gpus := uint64(gpuInfo.DeviceCount)
+		if overhead < gpus*512*1024*1024 {
+			overhead = gpus * 512 * 1024 * 1024
 		}
 		return int64(gpuInfo.FreeMemory - overhead), nil
 	}
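
In words: CheckVRAM now holds back the larger of 10% of total free VRAM or 512 MiB per GPU, replacing the old 384 MiB floor whose 10% term was also mistakenly scaled by device count. A minimal sketch of the same arithmetic, with hypothetical freeMemory and deviceCount parameters standing in for the real gpuInfo fields:

package main

import "fmt"

// usableVRAM mirrors the new overhead rule; freeMemory is total free
// VRAM in bytes across all devices, deviceCount the number of GPUs.
func usableVRAM(freeMemory, deviceCount uint64) int64 {
	// hold back the larger of 10% of free VRAM or 512 MiB per GPU
	overhead := freeMemory / 10
	if floor := deviceCount * 512 * 1024 * 1024; overhead < floor {
		overhead = floor
	}
	return int64(freeMemory - overhead)
}

func main() {
	// two GPUs with 8 GiB free in total: 10% is ~819 MiB, under the
	// 1 GiB (2 x 512 MiB) floor, so a full GiB is held back
	fmt.Println(usableVRAM(8*1024*1024*1024, 2)) // 7516192768
}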

llm/llm.go (+14 −8)

@@ -95,20 +95,26 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 				break
 			}
 
-			// no offloading required
-			if requiredTotal <= available {
-				break
-			}
-
-			// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
-			if requiredAlloc > available {
+			// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
+			// TODO: find the largest GPU and only reserve memory there
+			avgAvailable := available / int64(info.DeviceCount)
+			if requiredAlloc > avgAvailable {
 				log.Printf("not enough vram available, falling back to CPU only")
 				library = "cpu"
 				opts.NumGPU = 0
 				break
 			}
 
-			available -= requiredAlloc
+			// we don't know which GPU will be used, so estimate
+			// the scratch buffer space on all of them
+			// TODO: allocate less layers to the GPU with the scratch buffer
+			// and more to the others (based on their available memory)
+			available -= requiredAlloc * int64(info.DeviceCount)
+
+			// no offloading required
+			if requiredModel+requiredKv <= available {
+				break
+			}
 
 			// fill remaining vram with layers
 			log.Println("splitting", available, "of available memory bytes into layers")
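
The reworked branch checks the fixed graph allocation against an average-sized device first, reserves that amount on every GPU (since the layer split does not yet know which device will host it), and only then tests whether the weights and kv cache fit without a split. A minimal sketch of that decision order, using hypothetical byte counts in place of the values the real runner derives from the model file:

package main

import "fmt"

// planOffload distills the new ordering: fixed-allocation check
// first, per-device reservation second, full-offload check last.
func planOffload(available, requiredModel, requiredKv, requiredAlloc, deviceCount int64) string {
	// the graph alloc lands as a fixed amount on one gpu, so it must
	// fit on an average-sized device
	if requiredAlloc > available/deviceCount {
		return "cpu: fixed allocation does not fit"
	}
	// reserve the scratch space on every device, since we don't know
	// which one will end up holding it
	available -= requiredAlloc * deviceCount
	// no layer limit needed if the weights and kv cache both fit
	if requiredModel+requiredKv <= available {
		return "gpu: no layer split needed"
	}
	return fmt.Sprintf("gpu: splitting %d bytes into layers", available)
}

func main() {
	const GiB = int64(1024 * 1024 * 1024)
	// e.g. 16 GiB free across two GPUs, a 14 GiB model, 1 GiB kv
	// cache, and a 1 GiB graph alloc reserved on both devices
	fmt.Println(planOffload(16*GiB, 14*GiB, 1*GiB, 1*GiB, 2))
}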