浏览代码

use 10% vram overhead for cuda

Jeffrey Morgan 1 年之前
父节点
当前提交
cb534e6ac2
共有 2 个文件被更改,包括 6 次插入、4 次删除
  1. 5 4
      gpu/gpu.go
  2. 1 0
      llm/llm.go

+ 5 - 4
gpu/gpu.go

@@ -131,10 +131,11 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// allocate 384MiB for llama.cpp overhead (outside of model)
-		overhead := uint64(384 * 1024 * 1024)
-		if gpuInfo.FreeMemory <= overhead {
-			return 0, nil
+		// leave 10% or 400MiB of VRAM free for overhead
+		overhead := gpuInfo.FreeMemory / 10
+		minOverhead := 400 * 1024 * 1024
+		if overhead < minOverhead {
+			overhead = minOverhead
 		}
 		}
 
 
 		return int64(gpuInfo.FreeMemory - overhead), nil
 		return int64(gpuInfo.FreeMemory - overhead), nil

+ 1 - 0
llm/llm.go

@@ -117,6 +117,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
 			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
 			log.Println("bytes per layer:", bytesPerLayer)
 			log.Println("bytes per layer:", bytesPerLayer)
 			layers := available / bytesPerLayer
 			layers := available / bytesPerLayer
+			log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
 			if layers < int64(opts.NumGPU) {
 			if layers < int64(opts.NumGPU) {
 				opts.NumGPU = int(layers)
 				opts.NumGPU = int(layers)
 			}
 			}