6 kuukautta sitten · 312d9de1d1
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -88,6 +88,7 @@ import (
 
				 	"fmt"
			
 
				 	"runtime"
			
 
				 	"runtime/cgo"
			
 
				+	"slices"
			
 
				 	"strings"
			
 
				 	"unsafe"
			
 
				 )
			
@@ -260,7 +261,7 @@ func LoadModelFromFile(modelPath string, params ModelParams) (*Model, error) {
 
				 	}
			
 
				 
			
 
				 	m := Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
			
 
				-	if m.c == (*C.struct_llama_model)(C.NULL) {
			
 
				+	if m.c == nil {
			
 
				 		return nil, fmt.Errorf("unable to load model: %s", modelPath)
			
 
				 	}
			
 
				 
			
@@ -276,7 +277,7 @@ func NewContextWithModel(model *Model, params ContextParams) (*Context, error) {
 
				 		c:          C.llama_new_context_with_model(model.c, params.c),
			
 
				 		numThreads: int(params.c.n_threads),
			
 
				 	}
			
 
				-	if c.c == (*C.struct_llama_context)(C.NULL) {
			
 
				+	if c.c == nil {
			
 
				 		return nil, errors.New("unable to create llama context")
			
 
				 	}
			
 
				 
			
@@ -300,6 +301,9 @@ func (m *Model) ApplyLoraFromFile(context *Context, loraPath string, scale float
 
				 	defer C.free(unsafe.Pointer(cLoraPath))
			
 
				 
			
 
				 	loraAdapter := C.llama_lora_adapter_init(m.c, cLoraPath)
			
 
				+	if loraAdapter == nil {
			
 
				+		return errors.New("unable to load lora")
			
 
				+	}
			
 
				 
			
 
				 	err := -1
			
 
				 	if loraAdapter != nil {
			
@@ -322,13 +326,25 @@ type Batch struct {
 
				 // Creates a new batch for either word tokens or image embeddings (if embedSize is non-zero).
			
 
				 // Batches cannot contain both types at the same time. batchSize is the maximum number of entries
			
 
				 // that can be added per sequence
			
 
				-func NewBatch(batchSize int, maxSeq int, embedSize int) *Batch {
			
 
				-	return &Batch{
			
 
				+func NewBatch(batchSize int, maxSeq int, embedSize int) (*Batch, error) {
			
 
				+	b := Batch{
			
 
				 		c:         C.llama_batch_init(C.int(batchSize*maxSeq), C.int(embedSize), C.int(maxSeq)),
			
 
				 		batchSize: batchSize,
			
 
				 		maxSeq:    maxSeq,
			
 
				 		embedSize: embedSize,
			
 
				 	}
			
 
				+
			
 
				+	// Check to see if any of the allocations in llama_batch_init() failed
			
 
				+	nilPointer := (embedSize == 0 && b.c.token == nil) || (embedSize != 0 && b.c.embd == nil) ||
			
 
				+		b.c.pos == nil || b.c.n_seq_id == nil || b.c.seq_id == nil || b.c.logits == nil ||
			
 
				+		slices.Contains(unsafe.Slice(b.c.seq_id, b.allocSize()), nil)
			
 
				+
			
 
				+	if nilPointer {
			
 
				+		C.llama_batch_free(b.c)
			
 
				+		return nil, fmt.Errorf("unable to allocate batch (batchSize=%v maxSeq=%v embedSize=%v)", batchSize, maxSeq, embedSize)
			
 
				+	}
			
 
				+
			
 
				+	return &b, nil
			
 
				 }
			
 
				 
			
 
				 func (b *Batch) Size() int {
			
@@ -484,6 +500,9 @@ func NewClipContext(llamaContext *Context, modelPath string) (*ClipContext, erro
 
				 	mp := C.CString(modelPath)
			
 
				 	defer C.free(unsafe.Pointer(mp))
			
 
				 	c := C.clip_model_load(mp, 1)
			
 
				+	if c == nil {
			
 
				+		return nil, fmt.Errorf("unable to load clip model: %v", modelPath)
			
 
				+	}
			
 
				 
			
 
				 	projEmbedSize := int(C.clip_n_mmproj_embd(c))
			
 
				 	modelEmbedSize := llamaContext.Model().NEmbd()
			
@@ -498,8 +517,11 @@ func (c *ClipContext) Free() {
 
				 	C.clip_free(c.c)
			
 
				 }
			
 
				 
			
 
				-func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) [][]float32 {
			
 
				+func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) ([][]float32, error) {
			
 
				 	l := C.llava_image_embed_make_with_bytes(c.c, C.int(llamaContext.numThreads), (*C.uchar)(unsafe.Pointer(&data[0])), C.int(len(data)))
			
 
				+	if l == nil {
			
 
				+		return nil, errors.New("unable to make llava embedding from image")
			
 
				+	}
			
 
				 
			
 
				 	numTokens := int(l.n_image_pos)
			
 
				 	numEmbed := llamaContext.Model().NEmbd()
			
@@ -516,7 +538,7 @@ func (c *ClipContext) NewEmbed(llamaContext *Context, data []byte) [][]float32 {
 
				 
			
 
				 	C.llava_image_embed_free(l)
			
 
				 
			
 
				-	return embed
			
 
				+	return embed, nil
			
 
				 }
			
 
				 
			
 
				 type MllamaContext struct {
			
@@ -527,6 +549,9 @@ func NewMllamaContext(llamaContext *Context, modelPath string) (*MllamaContext,
 
				 	mp := C.CString(modelPath)
			
 
				 	defer C.free(unsafe.Pointer(mp))
			
 
				 	c := C.mllama_model_load(mp, 1)
			
 
				+	if c == nil {
			
 
				+		return nil, fmt.Errorf("unable to load mllama model: %v", modelPath)
			
 
				+	}
			
 
				 
			
 
				 	projEmbedSize := int(C.mllama_n_embd(c))
			
 
				 	modelEmbedSize := llamaContext.Model().NEmbd()
			
@@ -541,19 +566,25 @@ func (m *MllamaContext) Free() {
 
				 	C.mllama_free(m.c)
			
 
				 }
			
 
				 
			
 
				-func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) [][]float32 {
			
 
				+func (m *MllamaContext) NewEmbed(llamaContext *Context, data []byte, aspectRatioId int) ([][]float32, error) {
			
 
				 	img := C.mllama_image_init()
			
 
				 	defer C.mllama_image_free(img)
			
 
				 
			
 
				-	C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img)
			
 
				+	ok := bool(C.mllama_image_load_from_data(unsafe.Pointer(&data[0]), C.int(len(data)), 560, 560, 3, 4, C.int(aspectRatioId), img))
			
 
				+	if !ok {
			
 
				+		return nil, errors.New("unable to load mllama image data")
			
 
				+	}
			
 
				 
			
 
				 	rows := make([]float32, m.EmbedSize(llamaContext))
			
 
				-	C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0])))
			
 
				+	ok = bool(C.mllama_image_encode(m.c, C.int(llamaContext.numThreads), img, (*C.float)(unsafe.Pointer(&rows[0]))))
			
 
				+	if !ok {
			
 
				+		return nil, errors.New("unable to make mllama embedding from image")
			
 
				+	}
			
 
				 
			
 
				 	embed := make([][]float32, 1)
			
 
				 	embed[0] = rows
			
 
				 
			
 
				-	return embed
			
 
				+	return embed, nil
			
 
				 }
			
 
				 
			
 
				 func (m *MllamaContext) EmbedSize(llamaContext *Context) int {
			
@@ -592,7 +623,7 @@ type SamplingParams struct {
 
				 	Grammar        string
			
 
				 }
			
 
				 
			
 
				-func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
			
 
				+func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext, error) {
			
 
				 	var cparams C.struct_gpt_sampler_cparams
			
 
				 	cparams.top_k = C.int32_t(params.TopK)
			
 
				 	cparams.top_p = C.float(params.TopP)
			
@@ -615,9 +646,13 @@ func NewSamplingContext(model *Model, params SamplingParams) *SamplingContext {
 
				 
			
 
				 	cparams.grammar = grammar
			
 
				 	context := &SamplingContext{c: C.gpt_sampler_cinit(model.c, &cparams)}
			
 
				+	if context.c == nil {
			
 
				+		return nil, errors.New("unable to create sampling context")
			
 
				+	}
			
 
				+
			
 
				 	runtime.SetFinalizer(context, func(s *SamplingContext) { C.gpt_sampler_cfree(s.c) })
			
 
				 
			
 
				-	return context
			
 
				+	return context, nil
			
 
				 }
			
 
				 
			
 
				 func (s *SamplingContext) Reset() {
			
--- a/llama/runner/image.go
+++ b/llama/runner/image.go
@@ -63,9 +63,9 @@ func (c *ImageContext) Free(modelPath string) {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) [][]float32 {
			
 
				+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
			
 
				 	if c == nil {
			
 
				-		return nil
			
 
				+		return nil, nil
			
 
				 	}
			
 
				 
			
 
				 	hash := c.hashImage(data)
			
@@ -76,17 +76,23 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 
				 	embed, err := c.findImage(hash)
			
 
				 	if err != nil {
			
 
				 		if c.mllama != nil {
			
 
				-			embed = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
			
 
				+			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
			
 
				+			if err != nil {
			
 
				+				return nil, err
			
 
				+			}
			
 
				 		} else if c.clip != nil {
			
 
				-			embed = c.clip.NewEmbed(llamaContext, data)
			
 
				+			embed, err = c.clip.NewEmbed(llamaContext, data)
			
 
				+			if err != nil {
			
 
				+				return nil, err
			
 
				+			}
			
 
				 		} else {
			
 
				-			return nil
			
 
				+			return nil, errors.New("received image but vision model not loaded")
			
 
				 		}
			
 
				 
			
 
				 		c.addImage(hash, embed)
			
 
				 	}
			
 
				 
			
 
				-	return embed
			
 
				+	return embed, nil
			
 
				 }
			
 
				 
			
 
				 func (c *ImageContext) BatchSize(configuredBatchSize int) int {
			
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -131,7 +131,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 
				 
			
 
				 	var sc *llama.SamplingContext
			
 
				 	if params.samplingParams != nil {
			
 
				-		sc = llama.NewSamplingContext(s.model, *params.samplingParams)
			
 
				+		sc, err = llama.NewSamplingContext(s.model, *params.samplingParams)
			
 
				+		if err != nil {
			
 
				+			return nil, err
			
 
				+		}
			
 
				 		for _, input := range inputs {
			
 
				 			if input.embed == nil {
			
 
				 				sc.Accept(input.token, false)
			
@@ -194,7 +197,11 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 
				 				return nil, fmt.Errorf("invalid image index: %d", n)
			
 
				 			}
			
 
				 
			
 
				-			embed := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
			
 
				+			embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
			
 
				+			if err != nil {
			
 
				+				return nil, err
			
 
				+			}
			
 
				+
			
 
				 			for _, e := range embed {
			
 
				 				inputs = append(inputs, input{embed: e})
			
 
				 			}
			
@@ -305,13 +312,19 @@ func (s *Server) run(ctx context.Context) {
 
				 
			
 
				 	// Logically these batches are used only within the context of processBatch
			
 
				 	// but it is better for performance to allocate them once here
			
 
				-	tokenBatch := llama.NewBatch(s.batchSize, len(s.seqs), 0)
			
 
				+	tokenBatch, err := llama.NewBatch(s.batchSize, len(s.seqs), 0)
			
 
				+	if err != nil {
			
 
				+		panic(err)
			
 
				+	}
			
 
				 	defer tokenBatch.Free()
			
 
				 
			
 
				 	var embedBatch *llama.Batch
			
 
				 	embedBatchSize := s.image.BatchSize(s.batchSize)
			
 
				 	if embedBatchSize != 0 {
			
 
				-		embedBatch = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
			
 
				+		embedBatch, err = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
			
 
				+		if err != nil {
			
 
				+			panic(err)
			
 
				+		}
			
 
				 		defer embedBatch.Free()
			
 
				 	} else {
			
 
				 		embedBatch = &llama.Batch{}
			
--- a/llama/sampling_ext.cpp
+++ b/llama/sampling_ext.cpp
@@ -5,24 +5,28 @@
 
				 struct gpt_sampler *gpt_sampler_cinit(
			
 
				     const struct llama_model *model, struct gpt_sampler_cparams *params)
			
 
				 {
			
 
				-    gpt_sampler_params sparams;
			
 
				-    sparams.top_k = params->top_k;
			
 
				-    sparams.top_p = params->top_p;
			
 
				-    sparams.min_p = params->min_p;
			
 
				-    sparams.tfs_z = params->tfs_z;
			
 
				-    sparams.typ_p = params->typical_p;
			
 
				-    sparams.temp = params->temp;
			
 
				-    sparams.penalty_last_n = params->penalty_last_n;
			
 
				-    sparams.penalty_repeat = params->penalty_repeat;
			
 
				-    sparams.penalty_freq = params->penalty_freq;
			
 
				-    sparams.penalty_present = params->penalty_present;
			
 
				-    sparams.mirostat = params->mirostat;
			
 
				-    sparams.mirostat_tau = params->mirostat_tau;
			
 
				-    sparams.mirostat_eta = params->mirostat_eta;
			
 
				-    sparams.penalize_nl = params->penalize_nl;
			
 
				-    sparams.seed = params->seed;
			
 
				-    sparams.grammar = params->grammar;
			
 
				-    return gpt_sampler_init(model, sparams);
			
 
				+    try {
			
 
				+        gpt_sampler_params sparams;
			
 
				+        sparams.top_k = params->top_k;
			
 
				+        sparams.top_p = params->top_p;
			
 
				+        sparams.min_p = params->min_p;
			
 
				+        sparams.tfs_z = params->tfs_z;
			
 
				+        sparams.typ_p = params->typical_p;
			
 
				+        sparams.temp = params->temp;
			
 
				+        sparams.penalty_last_n = params->penalty_last_n;
			
 
				+        sparams.penalty_repeat = params->penalty_repeat;
			
 
				+        sparams.penalty_freq = params->penalty_freq;
			
 
				+        sparams.penalty_present = params->penalty_present;
			
 
				+        sparams.mirostat = params->mirostat;
			
 
				+        sparams.mirostat_tau = params->mirostat_tau;
			
 
				+        sparams.mirostat_eta = params->mirostat_eta;
			
 
				+        sparams.penalize_nl = params->penalize_nl;
			
 
				+        sparams.seed = params->seed;
			
 
				+        sparams.grammar = params->grammar;
			
 
				+        return gpt_sampler_init(model, sparams);
			
 
				+    } catch (const std::exception & err) {
			
 
				+        return nullptr;
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				 void gpt_sampler_cfree(struct gpt_sampler *sampler)