преди 11 месеца · 72be8e27c4
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -28,6 +28,7 @@ package llama
 
															 // #include "llama.h"
														
 
															 // #include "clip.h"
														
 
															 // #include "llava.h"
														
 
															+// #include "sampling_ext.h"
														
 
															 import "C"
														
 
															 import (
														
 
															 	"fmt"
														
@@ -244,6 +245,7 @@ func Quantize(infile, outfile string, ftype llm.FileType) error {
 
															 	return nil
														
 
															 }
														
 
															+// llava
														
 
															 type ClipContext struct {
														
 
															 	c *C.struct_clip_ctx
														
 
															 }
														
@@ -270,3 +272,65 @@ func NewLlavaImageEmbed(clipContext *ClipContext, data []byte) *LlavaImageEmbed
 
															 func LlavaEvalImageEmbed(llamaContext *Context, embed *LlavaImageEmbed, nBatch int, nPast *int) {
														
 
															 	C.llava_eval_image_embed(llamaContext.c, embed.c, C.int(nBatch), (*C.int)(unsafe.Pointer(nPast)))
														
 
															 }
														
 
															+
														
 
															+// sampling
														
 
															+// TODO: this is a temporary wrapper to allow calling C++ code from CGo
														
 
															+type SamplingContext struct {
														
 
															+	c *C.struct_llama_sampling_context
														
 
															+}
														
 
															+
														
 
															+type SamplingParams struct {
														
 
															+	TopK           int
														
 
															+	TopP           float32
														
 
															+	TfsZ           float32
														
 
															+	TypicalP       float32
														
 
															+	Temp           float32
														
 
															+	PenaltyRepeat  float32
														
 
															+	PenaltyFreq    float32
														
 
															+	PenaltyPresent float32
														
 
															+	Mirostat       int
														
 
															+	MirostatTau    float32
														
 
															+	MirostatEta    float32
														
 
															+	PenalizeNl     bool
														
 
															+	Seed           uint32
														
 
															+}
														
 
															+
														
 
															+func NewSamplingContext(params SamplingParams) *SamplingContext {
														
 
															+	var cparams C.struct_llama_sampling_cparams
														
 
															+	cparams.top_k = C.int32_t(params.TopK)
														
 
															+	cparams.top_p = C.float(params.TopP)
														
 
															+	cparams.tfs_z = C.float(params.TfsZ)
														
 
															+	cparams.typical_p = C.float(params.TypicalP)
														
 
															+	cparams.temp = C.float(params.Temp)
														
 
															+	cparams.penalty_repeat = C.float(params.PenaltyRepeat)
														
 
															+	cparams.penalty_freq = C.float(params.PenaltyFreq)
														
 
															+	cparams.penalty_present = C.float(params.PenaltyFreq)
														
 
															+	cparams.mirostat = C.int32_t(params.Mirostat)
														
 
															+	cparams.mirostat_tau = C.float(params.MirostatTau)
														
 
															+	cparams.mirostat_eta = C.float(params.MirostatEta)
														
 
															+	cparams.penalize_nl = C.bool(params.PenalizeNl)
														
 
															+	cparams.seed = C.uint32_t(params.Seed)
														
 
															+	return &SamplingContext{c: C.llama_sampling_cinit(&cparams)}
														
 
															+}
														
 
															+
														
 
															+func (s *SamplingContext) Free() {
														
 
															+	C.llama_sampling_cfree(s.c)
														
 
															+}
														
 
															+
														
 
															+func (s *SamplingContext) Reset() {
														
 
															+	C.llama_sampling_creset(s.c)
														
 
															+}
														
 
															+
														
 
															+func (s *SamplingContext) Sample(ctxMain *Context, ctxConfig *Context, idx int) int {
														
 
															+	// TODO (jmorganca): handle nil for all args
														
 
															+	if ctxConfig == nil {
														
 
															+		return int(C.llama_sampling_csample(s.c, ctxMain.c, nil, C.int(idx)))
														
 
															+	}
														
 
															+
														
 
															+	return int(C.llama_sampling_csample(s.c, ctxMain.c, ctxConfig.c, C.int(idx)))
														
 
															+
														
 
															+}
														
 
															+
														
 
															+func (s *SamplingContext) Accept(ctxMain *Context, id int, applyGrammar bool) {
														
 
															+	C.llama_sampling_caccept(s.c, ctxMain.c, C.llama_token(id), C.bool(applyGrammar))
														
 
															+}
														
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -24,6 +24,8 @@ type Sequence struct {
 
															 	tokens []int
														
 
															 	responses chan string
														
 
															+
														
 
															+	samplingCtx *llama.SamplingContext
														
 
															 }
														
 
															 // prompt returns true if the prompt is still being processed
														
@@ -31,15 +33,41 @@ func (s *Sequence) prompt() bool {
 
															 	return s.nPast < len(s.tokens)-1
														
 
															 }
														
 
															-func (s *Server) NewSequence(text string, w http.ResponseWriter) *Sequence {
														
 
															-	tokens, err := s.lc.Model().Tokenize(text, 2048, true, true)
														
 
															+func DefaultParams() llama.SamplingParams {
														
 
															+	return llama.SamplingParams{}
														
 
															+}
														
 
															+
														
 
															+func (s *Server) NewSequence(r Request, w http.ResponseWriter) *Sequence {
														
 
															+	var samplingParams llama.SamplingParams
														
 
															+	samplingParams.TopK = r.TopK
														
 
															+	samplingParams.TopP = r.TopP
														
 
															+	samplingParams.TfsZ = r.TFSZ
														
 
															+	samplingParams.TypicalP = r.TypicalP
														
 
															+	samplingParams.Temp = r.Temperature
														
 
															+	samplingParams.PenaltyRepeat = r.RepeatPenalty
														
 
															+	samplingParams.PenaltyFreq = r.FrequencyPenalty
														
 
															+	samplingParams.PenaltyPresent = r.PresencePenalty
														
 
															+	samplingParams.Mirostat = r.Mirostat
														
 
															+	samplingParams.MirostatTau = r.MirostatTau
														
 
															+	samplingParams.MirostatEta = r.MirostatEta
														
 
															+	samplingParams.PenalizeNl = r.PenalizeNewline
														
 
															+	samplingParams.Seed = uint32(r.Seed)
														
 
															+
														
 
															+	tokens, err := s.lc.Model().Tokenize(r.Prompt, 2048, false, true)
														
 
															 	if err != nil {
														
 
															 		panic(err)
														
 
															 	}
														
 
															+	sc := llama.NewSamplingContext(samplingParams)
														
 
															+
														
 
															+	for _, t := range tokens {
														
 
															+		sc.Accept(s.lc, t, false)
														
 
															+	}
														
 
															+
														
 
															 	return &Sequence{
														
 
															-		tokens:    tokens,
														
 
															-		responses: make(chan string, 1),
														
 
															+		tokens:      tokens,
														
 
															+		responses:   make(chan string, 1),
														
 
															+		samplingCtx: sc,
														
 
															 	}
														
 
															 }
														
@@ -80,7 +108,6 @@ func (s *Server) run(ctx context.Context) {
 
															 			slog.Info("Processing batch", "seqs", len(s.seqs))
														
 
															 			s.mu.Lock()
														
 
															 			for s.allNil() {
														
 
															-				fmt.Println("wait")
														
 
															 				s.cond.Wait() // Wait until an item is added
														
 
															 			}
														
 
															 			s.mu.Unlock()
														
@@ -133,8 +160,16 @@ func (s *Server) run(ctx context.Context) {
 
															 				// sample a token
														
 
															 				// TODO: sample based on the sequence
														
 
															 				fmt.Println("Sampling token", i, ibatch[i])
														
 
															-				logits := s.lc.GetLogitsIth(ibatch[i])
														
 
															-				token := s.lc.SampleTokenGreedy(logits)
														
 
															+				fmt.Println("calling sample", s.lc, nil, ibatch[i])
														
 
															+				token := seq.samplingCtx.Sample(s.lc, nil, ibatch[i])
														
 
															+				seq.samplingCtx.Accept(s.lc, token, true)
														
 
															+
														
 
															+				// logits := s.lc.GetLogitsIth(ibatch[i])
														
 
															+				// token := s.lc.SampleTokenGreedy(logits)
														
 
															+				fmt.Println("sampled", token, s.model.TokenToPiece(token))
														
 
															+
														
 
															+				seq.responses <- s.model.TokenToPiece(token)
														
 
															+				seq.tokens = []int{token}
														
 
															 				// if it's an end of sequence token, break
														
 
															 				// TODO: just end this sequence
														
@@ -145,9 +180,6 @@ func (s *Server) run(ctx context.Context) {
 
															 					s.seqs[i] = nil
														
 
															 					continue
														
 
															 				}
														
 
															-
														
 
															-				seq.responses <- s.model.TokenToPiece(token)
														
 
															-				seq.tokens = []int{token}
														
 
															 			}
														
 
															 			batch.Clear()
														
@@ -168,6 +200,7 @@ type Response struct {
 
															 func (s *Server) handler(w http.ResponseWriter, r *http.Request) {
														
 
															 	var request Request
														
 
															+	request.Options = api.DefaultOptions()
														
 
															 	if err := json.NewDecoder(r.Body).Decode(&request); err != nil {
														
 
															 		http.Error(w, "Bad request", http.StatusBadRequest)
														
 
															 		return
														
@@ -178,7 +211,7 @@ func (s *Server) handler(w http.ResponseWriter, r *http.Request) {
 
															 	w.Header().Set("Transfer-Encoding", "chunked")
														
 
															 	w.WriteHeader(http.StatusOK)
														
 
															-	seq := s.NewSequence(request.Prompt, w)
														
 
															+	seq := s.NewSequence(request, w)
														
 
															 	s.mu.Lock()
														
 
															 	for i, sq := range s.seqs {
														
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@@ -0,0 +1,45 @@
 
															+// TODO: this is a temporary wrapper to allow calling C++ code from CGo
														
 
															+#include "sampling.h"
														
 
															+#include "sampling_ext.h"
														
 
															+
														
 
															+struct llama_sampling_context* llama_sampling_cinit(struct llama_sampling_cparams *params) {
														
 
															+    llama_sampling_params sparams;
														
 
															+    sparams.top_k = params->top_k;
														
 
															+    sparams.top_p = params->top_p;
														
 
															+    sparams.tfs_z = params->tfs_z;
														
 
															+    sparams.typical_p = params->typical_p;
														
 
															+    sparams.temp = params->temp;
														
 
															+    sparams.penalty_repeat = params->penalty_repeat;
														
 
															+    sparams.penalty_freq = params->penalty_freq;
														
 
															+    sparams.penalty_present = params->penalty_present;
														
 
															+    sparams.mirostat = params->mirostat;
														
 
															+    sparams.mirostat_tau = params->mirostat_tau;
														
 
															+    sparams.mirostat_eta = params->mirostat_eta;
														
 
															+    sparams.penalize_nl = params->penalize_nl;
														
 
															+    sparams.seed = params->seed;
														
 
															+    return llama_sampling_init(sparams);
														
 
															+}
														
 
															+
														
 
															+void llama_sampling_cfree(struct llama_sampling_context * ctx){
														
 
															+    llama_sampling_free(ctx);
														
 
															+}
														
 
															+
														
 
															+void llama_sampling_creset(struct llama_sampling_context * ctx){
														
 
															+    llama_sampling_reset(ctx);
														
 
															+}
														
 
															+
														
 
															+llama_token llama_sampling_csample(
														
 
															+        struct llama_sampling_context * ctx_sampling,
														
 
															+        struct llama_context * ctx_main,
														
 
															+        struct llama_context * ctx_cfg,
														
 
															+        int idx) {
														
 
															+    return llama_sampling_sample(ctx_sampling, ctx_main, ctx_cfg, idx);
														
 
															+}
														
 
															+
														
 
															+void llama_sampling_caccept(
														
 
															+        struct llama_sampling_context * ctx_sampling,
														
 
															+        struct llama_context * ctx_main,
														
 
															+        llama_token id,
														
 
															+        bool apply_grammar) {
														
 
															+    llama_sampling_accept(ctx_sampling, ctx_main, id, apply_grammar);
														
 
															+}
														
--- a/llama/sampling_ext.h
+++ b/llama/sampling_ext.h
@@ -0,0 +1,47 @@
 
															+// TODO: this is a temporary wrapper to allow calling C++ code from CGo
														
 
															+#ifndef LLAMA_SAMPLING_EXT_H
														
 
															+#define LLAMA_SAMPLING_EXT_H
														
 
															+
														
 
															+#include "llama.h"
														
 
															+
														
 
															+#ifdef __cplusplus
														
 
															+extern "C" {
														
 
															+#endif
														
 
															+
														
 
															+struct llama_sampling_cparams {
														
 
															+    int32_t     top_k;
														
 
															+    float       top_p;
														
 
															+    float       tfs_z;
														
 
															+    float       typical_p;
														
 
															+    float       temp;
														
 
															+    float       penalty_repeat;
														
 
															+    float       penalty_freq;
														
 
															+    float       penalty_present;
														
 
															+    int32_t     mirostat;
														
 
															+    float       mirostat_tau;
														
 
															+    float       mirostat_eta;
														
 
															+    bool        penalize_nl;
														
 
															+    uint32_t    seed;
														
 
															+};
														
 
															+
														
 
															+struct llama_sampling_context* llama_sampling_cinit(struct llama_sampling_cparams *params);
														
 
															+void llama_sampling_cfree(struct llama_sampling_context * ctx);
														
 
															+void llama_sampling_creset(struct llama_sampling_context * ctx);
														
 
															+
														
 
															+llama_token llama_sampling_csample(
														
 
															+        struct llama_sampling_context * ctx_sampling,
														
 
															+        struct llama_context * ctx_main,
														
 
															+        struct llama_context * ctx_cfg,
														
 
															+        int idx);
														
 
															+
														
 
															+void llama_sampling_caccept(
														
 
															+        struct llama_sampling_context * ctx_sampling,
														
 
															+        struct llama_context * ctx_main,
														
 
															+        llama_token id,
														
 
															+        bool apply_grammar);
														
 
															+
														
 
															+#ifdef __cplusplus
														
 
															+}
														
 
															+#endif
														
 
															+
														
 
															+#endif // LLAMA_SAMPLING_EXT_H