@@ -85,19 +85,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		graphPartialOffload = graphFullOffload
 	}
 
+	layers := ggml.Tensors().Layers()
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
+	memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
 
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
 	if memoryRequiredPartial > memoryAvailable {
 		slog.Debug("insufficient VRAM to load any model layers")
 		return 0, 0
 	}
 
-	layers := ggml.Tensors().Layers()
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
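
In effect, both estimates now reserve room for one full decoder block (`layers["blk.0"].size()`) on top of the baseline and compute-graph memory, so the early-exit check rejects offloading unless at least one layer actually fits in VRAM. A minimal standalone sketch of that check, using hypothetical names rather than the actual ollama identifiers:

```go
package main

import "fmt"

// fitsAnyLayer mirrors the revised lower bound: partial offload is only
// worthwhile when the fixed overhead plus one decoder block fits in VRAM.
// All names and values here are illustrative, not ollama's real API.
func fitsAnyLayer(memoryMinimum, graphPartialOffload, firstBlockSize, memoryAvailable uint64) bool {
	memoryRequiredPartial := memoryMinimum + graphPartialOffload + firstBlockSize
	return memoryRequiredPartial <= memoryAvailable
}

func main() {
	const MiB = uint64(1) << 20
	// 512 MiB baseline + 256 MiB graph + 400 MiB first block = 1168 MiB,
	// which exceeds 1 GiB, so offload is refused. The pre-change estimate
	// (768 MiB, without the first block) would have passed this check.
	fmt.Println(fitsAnyLayer(512*MiB, 256*MiB, 400*MiB, 1024*MiB)) // false
}
```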