|
@@ -734,7 +734,10 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
|
|
|
|
|
|
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
|
|
|
func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
|
|
|
- *numParallel = 1
|
|
|
+ if *numParallel <= 0 {
|
|
|
+ *numParallel = 1
|
|
|
+ req.opts.NumCtx = req.origNumCtx
|
|
|
+ }
|
|
|
byLibrary := gpus.ByLibrary()
|
|
|
if len(byLibrary) <= 1 {
|
|
|
return gpus
|