
runner.go: Support resource usage command line options

Command line options to the runner that control resource usage
(mmap, mlock, tensor split) are used by Ollama but not currently
implemented. This implements support for these while ignoring
others that have no meaning in this context.
Jesse Gross, 8 months ago
commit e4a091bafd
3 changed files with 79 additions and 60 deletions
  1. llama/example/main.go (+8 -3)
  2. llama/llama.go (+42 -28)
  3. llama/runner/runner.go (+29 -29)
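
Before the per-file diffs, a quick sketch of the caller-facing result: model-loading options move from NewModelParams arguments to fields on a ModelParams struct. The field names mirror the definition added in llama/llama.go below; the import path, model path, and split proportions are assumptions for illustration only.

	package main

	import (
		"fmt"

		"github.com/ollama/ollama/llama" // assumed import path
	)

	func main() {
		llama.BackendInit()

		params := llama.ModelParams{
			NumGpuLayers: 999,             // offload as many layers as will fit
			MainGpu:      0,               // index of the primary GPU
			UseMmap:      true,            // memory-map the model file
			UseMlock:     false,           // do not pin model memory in RAM
			TensorSplit:  []float32{3, 1}, // illustrative 3:1 split across two GPUs
			Progress: func(p float32) {
				fmt.Printf("loading... %f\n", p)
			},
		}

		model := llama.LoadModelFromFile("/path/to/model.gguf", params) // path is illustrative
		_ = model
	}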

llama/example/main.go (+8 -3)

@@ -29,9 +29,14 @@ func main() {
 
 
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams(999, 0, func(p float32) {
-		fmt.Printf("loading... %f\n", p)
-	})
+	params := llama.ModelParams{
+		NumGpuLayers: 999,
+		MainGpu:      0,
+		UseMmap:      true,
+		Progress: func(p float32) {
+			fmt.Printf("loading... %f\n", p)
+		},
+	}
 	model := llama.LoadModelFromFile(*mpath, params)
 	ctxParams := llama.NewContextParams(2048, runtime.NumCPU(), false)


llama/llama.go (+42 -28)

@@ -78,33 +78,6 @@ func NewContextParams(numCtx int, threads int, flashAttention bool) ContextParam
 	return ContextParams{c: params}
 }

-type ModelParams struct {
-	c C.struct_llama_model_params
-}
-
-//export llamaProgressCallback
-func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
-	handle := cgo.Handle(userData)
-	callback := handle.Value().(func(float32))
-	callback(float32(progress))
-	return true
-}
-
-func NewModelParams(numGpuLayers int, mainGpu int, callback func(float32)) ModelParams {
-	params := C.llama_model_default_params()
-	params.n_gpu_layers = C.int(numGpuLayers)
-	params.main_gpu = C.int32_t(mainGpu)
-
-	handle := cgo.NewHandle(callback)
-	params.progress_callback = C.llama_progress_callback(C.llamaProgressCallback)
-	params.progress_callback_user_data = unsafe.Pointer(handle)
-	runtime.SetFinalizer(&params, func(p *C.struct_llama_model_params) {
-		handle.Delete()
-	})
-
-	return ModelParams{c: params}
-}
-
 type Context struct {
 	c *C.struct_llama_context
 }
@@ -179,8 +152,49 @@ func (c *Context) GetEmbeddingsIth(i int) []float32 {
 	return unsafe.Slice((*float32)(unsafe.Pointer(C.llama_get_embeddings_ith(c.c, C.int32_t(i)))), c.Model().NEmbd())
 }

+type ModelParams struct {
+	NumGpuLayers int
+	MainGpu      int
+	UseMmap      bool
+	UseMlock     bool
+	TensorSplit  []float32
+	Progress     func(float32)
+}
+
+//export llamaProgressCallback
+func llamaProgressCallback(progress C.float, userData unsafe.Pointer) C.bool {
+	handle := cgo.Handle(userData)
+	callback := handle.Value().(func(float32))
+	callback(float32(progress))
+	return true
+}
+
 func LoadModelFromFile(modelPath string, params ModelParams) *Model {
-	return &Model{c: C.llama_load_model_from_file(C.CString(modelPath), params.c)}
+	cparams := C.llama_model_default_params()
+	cparams.n_gpu_layers = C.int(params.NumGpuLayers)
+	cparams.main_gpu = C.int32_t(params.MainGpu)
+	cparams.use_mmap = C.bool(params.UseMmap)
+	cparams.use_mlock = C.bool(params.UseMlock)
+
+	if len(params.TensorSplit) > 0 {
+		tensorSplitData := &params.TensorSplit[0]
+
+		var tensorSplitPin runtime.Pinner
+		tensorSplitPin.Pin(tensorSplitData)
+		defer tensorSplitPin.Unpin()
+
+		cparams.tensor_split = (*C.float)(unsafe.Pointer(tensorSplitData))
+	}
+
+	if params.Progress != nil {
+		handle := cgo.NewHandle(params.Progress)
+		defer handle.Delete()
+
+		cparams.progress_callback = C.llama_progress_callback(C.llamaProgressCallback)
+		cparams.progress_callback_user_data = unsafe.Pointer(handle)
+	}
+
+	return &Model{c: C.llama_load_model_from_file(C.CString(modelPath), cparams)}
 }

 func NewContextWithModel(model *Model, params ContextParams) *Context {
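
The new LoadModelFromFile builds the C parameter struct itself, which is where the two less obvious pieces of Go/C plumbing live: runtime.Pinner keeps the backing array of TensorSplit at a stable address while C reads it, and runtime/cgo.Handle carries the Progress closure through the progress_callback_user_data void* pointer. A minimal pure-Go sketch of both mechanisms (no cgo involved; the names here are illustrative, not part of the commit):

	package main

	import (
		"fmt"
		"runtime"
		"runtime/cgo"
	)

	func main() {
		// Pin the first element so the slice's backing array cannot be moved
		// by the GC while a (hypothetical) C call holds a pointer into it.
		split := []float32{3, 1}
		var pin runtime.Pinner
		pin.Pin(&split[0])
		defer pin.Unpin()

		// Wrap a Go closure in a cgo.Handle; the handle is what would be
		// smuggled through the C user-data pointer.
		h := cgo.NewHandle(func(p float32) { fmt.Println("progress:", p) })
		defer h.Delete()

		// On the callback side the handle is recovered and the closure invoked,
		// mirroring what llamaProgressCallback does in the diff above.
		cb := h.Value().(func(float32))
		cb(0.5)
	}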

llama/runner/runner.go (+29 -29)

@@ -12,6 +12,7 @@ import (
 	"net/http"
 	"net/http"
 	"os"
 	"os"
 	"path/filepath"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"runtime"
 	"strconv"
 	"strconv"
 	"strings"
 	"strings"
@@ -599,16 +600,16 @@ func main() {
 	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-
-	// TODO not yet implemented but wired to keep the parsing aligned
-	embedding := flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	logDisable := flag.Bool("log-disable", false, "disables logging to a file")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
-	f32 := flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
 	noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")

+	// These are either ignored by llama.cpp or have no significance to us
+	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
+	_ = flag.Bool("log-disable", false, "disables logging to a file")
+	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
+
 	flag.Parse()
 	level := slog.LevelInfo
 	if *verbose {
@@ -627,26 +628,6 @@ func main() {
 	})
 	slog.SetDefault(slog.New(handler))

-	// TODO actually implement...
-	if *embedding {
-		slog.Warn("embeddings not yet supported")
-	}
-	if *logDisable {
-		slog.Info("ignoring --log-disable")
-	}
-	if *f32 {
-		slog.Warn("memory-f32 not yet supported")
-	}
-	if *noMmap {
-		slog.Warn("no-mmap not yet supported")
-	}
-	if *mlock {
-		slog.Warn("mlock not yet supported")
-	}
-	if *tensorSplit != "" {
-		slog.Warn("tensor-split not yet implemented")
-	}
-
 	server := &Server{
 		numCtx:    *kvSize / *parallel,
 		batchSize: *batchSize,
@@ -659,10 +640,29 @@ func main() {
 	// otherwise Ollama can timeout for large model loads
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams(*nGpuLayers, *mainGpu, func(progress float32) {
-		slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
-		server.progress = progress
-	})
+
+	var tensorSplitFloats []float32
+	if *tensorSplit != "" {
+		stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
+
+		tensorSplitFloats = make([]float32, 0, len(stringFloats))
+		for _, s := range stringFloats {
+			f, _ := strconv.ParseFloat(s, 32)
+			tensorSplitFloats = append(tensorSplitFloats, float32(f))
+		}
+	}
+
+	params := llama.ModelParams{
+		NumGpuLayers: *nGpuLayers,
+		MainGpu:      *mainGpu,
+		UseMmap:      !*noMmap && *lpath == "",
+		UseMlock:     *mlock,
+		TensorSplit:  tensorSplitFloats,
+		Progress: func(progress float32) {
+			slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
+			server.progress = progress
+		},
+	}
 	server.model = llama.LoadModelFromFile(*mpath, params)

 	if *lpath != "" {
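
Two details worth noting in the runner changes: the -tensor-split string is split on commas with a regexp (a plain strings.Split behaves identically here) and parse errors are silently ignored, and UseMmap is turned off whenever a LoRA path is supplied (UseMmap: !*noMmap && *lpath == ""), presumably because applying an adapter modifies the mapped weights. A standalone sketch of the parsing (parseTensorSplit is an illustrative helper, not part of the commit):

	package main

	import (
		"fmt"
		"strconv"
		"strings"
	)

	// parseTensorSplit turns "3,1" into []float32{3, 1}. The committed code uses
	// regexp.MustCompile(",").Split and drops parse errors; errors are surfaced
	// here only to make the behavior explicit.
	func parseTensorSplit(s string) ([]float32, error) {
		if s == "" {
			return nil, nil
		}
		parts := strings.Split(s, ",")
		out := make([]float32, 0, len(parts))
		for _, p := range parts {
			f, err := strconv.ParseFloat(strings.TrimSpace(p), 32)
			if err != nil {
				return nil, fmt.Errorf("invalid proportion %q: %w", p, err)
			}
			out = append(out, float32(f))
		}
		return out, nil
	}

	func main() {
		split, err := parseTensorSplit("3,1") // e.g. from -tensor-split 3,1
		fmt.Println(split, err)               // [3 1] <nil>
	}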