
add more runner params

jmorganca 11 months ago
parent
commit
0d365e8d34
2 changed files with 52 additions and 16 deletions
  1. llama/llama.go (+25, -5)
  2. llama/runner/runner.go (+27, -11)

llama/llama.go (+25, -5)

@@ -31,6 +31,7 @@ package llama
 // #include "sampling_ext.h"
 import "C"
 import (
+	"errors"
 	"fmt"
 	"runtime"
 	"strings"
@@ -49,13 +50,14 @@ type ContextParams struct {
 	c C.struct_llama_context_params
 }
 
-func NewContextParams() ContextParams {
+func NewContextParams(numCtx int, threads int, flashAttention bool) ContextParams {
 	params := C.llama_context_default_params()
-	params.seed = C.uint(1234)
-	params.n_ctx = C.uint(2048)
+	params.n_ctx = C.uint(numCtx)
 	params.n_threads = C.uint(runtime.NumCPU())
 	params.n_threads_batch = params.n_threads
 	params.embeddings = C.bool(true)
+	params.flash_attn = C.bool(flashAttention)
+	params.n_threads = C.uint(threads)
 	return ContextParams{c: params}
 }
 
@@ -63,9 +65,10 @@ type ModelParams struct {
 	c C.struct_llama_model_params
 }
 
-func NewModelParams() ModelParams {
+func NewModelParams(numGpuLayers int, mainGpu int) ModelParams {
 	params := C.llama_model_default_params()
-	params.n_gpu_layers = 999
+	params.n_gpu_layers = C.int(numGpuLayers)
+	params.main_gpu = C.int32_t(mainGpu)
 	return ModelParams{c: params}
 }
 
@@ -155,6 +158,23 @@ func (m *Model) TokenIsEog(token int) bool {
 	return bool(C.llama_token_is_eog(m.c, C.llama_token(token)))
 }
 
+func (m *Model) ApplyLoraFromFile(loraPath string, scale float32, baseModelPath string, threads int) error {
+	cLoraPath := C.CString(loraPath)
+	defer C.free(unsafe.Pointer(cLoraPath))
+
+	var cBaseModelPath *C.char
+	if baseModelPath != "" {
+		cBaseModelPath = C.CString(baseModelPath)
+	}
+
+	code := int(C.llama_model_apply_lora_from_file(m.c, cLoraPath, C.float(scale), cBaseModelPath, C.int32_t(threads)))
+	if code != 0 {
+		return errors.New("error applying lora from file")
+	}
+
+	return nil
+}
+
 type Batch struct {
 	c C.struct_llama_batch
 }
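
For orientation, a minimal sketch of how the updated llama.go API fits together after this change; the import path, file paths, and numeric values are placeholders rather than anything taken from the commit, and error handling is kept to the bare minimum:

package main

import "github.com/ollama/ollama/llama" // import path assumed from the repository layout

func main() {
	llama.BackendInit()

	// Illustrative values: offload 33 layers to GPU 0.
	modelParams := llama.NewModelParams(33, 0)
	model := llama.LoadModelFromFile("/path/to/model.gguf", modelParams)

	// Optionally apply a LoRA adapter before creating the context
	// (scale 1.0, no base model override, 8 threads).
	if err := model.ApplyLoraFromFile("/path/to/adapter.bin", 1.0, "", 8); err != nil {
		panic(err)
	}

	// 4096-token context, 8 threads, flash attention enabled.
	ctxParams := llama.NewContextParams(4096, 8, true)
	lc := llama.NewContextWithModel(model, ctxParams)
	if lc == nil {
		panic("failed to create context")
	}
}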

llama/runner/runner.go (+27, -11)

@@ -9,6 +9,7 @@ import (
 	"log/slog"
 	"net"
 	"net/http"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -73,6 +74,8 @@ type Server struct {
 	lc    *llama.Context
 	cc    *llama.ClipContext
 
+	batchSize int
+
 	// parallel is the number of parallel requests to handle
 	parallel int
 
@@ -154,7 +157,7 @@ func truncateStop(pieces []string, stop string) []string {
 }
 
 func (s *Server) run(ctx context.Context) {
-	batch := llama.NewBatch(512, 0, s.parallel)
+	batch := llama.NewBatch(s.batchSize, 0, s.parallel)
 	defer batch.Free()
 
 	// build up stop sequences as we recognize them
@@ -182,7 +185,7 @@ func (s *Server) run(ctx context.Context) {
 
 				for j, t := range seq.tokens {
 					// todo: make this n_batch
-					if j > 512 {
+					if j > s.batchSize {
 						break
 					}
 
@@ -207,10 +210,10 @@ func (s *Server) run(ctx context.Context) {
 
 				// don't sample prompt processing
 				if seq.prompt() {
-					if len(seq.tokens) < 512 {
+					if len(seq.tokens) < s.batchSize {
 						seq.tokens = []int{}
 					} else {
-						seq.tokens = seq.tokens[512:]
+						seq.tokens = seq.tokens[s.batchSize:]
 					}
 
 					continue
@@ -412,14 +415,26 @@ func main() {
 	mpath := flag.String("model", "", "Path to model binary file")
 	ppath := flag.String("projector", "", "Path to projector binary file")
 	parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
+	batchSize := flag.Int("batch-size", 512, "Batch size")
+	nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
+	flashAttention := flag.Bool("flash-attention", false, "Enable flash attention")
+	numCtx := flag.Int("num-ctx", 2048, "Context (or KV cache) size")
+	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
+	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	flag.Parse()
 
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams()
+	params := llama.NewModelParams(*nGpuLayers, *mainGpu)
 	model := llama.LoadModelFromFile(*mpath, params)
-	ctxParams := llama.NewContextParams()
+
+	if *lpath != "" {
+		model.ApplyLoraFromFile(*lpath, 1.0, "", *threads)
+	}
+
+	ctxParams := llama.NewContextParams(*numCtx, *threads, *flashAttention)
 	lc := llama.NewContextWithModel(model, ctxParams)
 	if lc == nil {
 		panic("Failed to create context")
@@ -434,11 +449,12 @@ func main() {
 	}
 
 	server := &Server{
-		model:    model,
-		lc:       lc,
-		cc:       cc,
-		parallel: *parallel,
-		seqs:     make([]*Sequence, *parallel),
+		model:     model,
+		lc:        lc,
+		cc:        cc,
+		batchSize: *batchSize,
+		parallel:  *parallel,
+		seqs:      make([]*Sequence, *parallel),
 	}
 
 	server.cond = sync.NewCond(&server.mu)
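
With the new flags wired in, a runner invocation might look like the following; the binary name and every value shown are illustrative only, not taken from the commit:

./runner \
	-model /path/to/model.gguf \
	-num-ctx 4096 \
	-batch-size 512 \
	-n-gpu-layers 33 \
	-main-gpu 0 \
	-flash-attention \
	-threads 8 \
	-lora /path/to/adapter.bin \
	-parallel 2 \
	-port 8080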