@@ -339,6 +339,7 @@ func newLlama(model string, adapters []string, runners []ModelRunner, numLayers
 		"--model", model,
 		"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
+		"--main-gpu", fmt.Sprintf("%d", opts.MainGPU),
 		"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
 		"--embedding",
 	}
@@ -544,6 +545,7 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
 		"stream": true,
 		"n_predict": llm.NumPredict,
 		"n_keep": llm.NumKeep,
+		"main_gpu": llm.MainGPU,
 		"temperature": llm.Temperature,
 		"top_k": llm.TopK,
 		"top_p": llm.TopP,