@@ -30,6 +30,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llama"
+	"github.com/ollama/ollama/model"
 )
 
 type LlamaServer interface {
@@ -54,8 +55,15 @@ type llmServer struct {
 	options     api.Options
 	numParallel int
 	modelPath   string
-	modelLock   sync.Mutex   // Temporary until we switch fully to Go server
-	model       *llama.Model // If non-nil, the runner is a new Go server
+
+	// llamaModel is an instance of the cgo llama.cpp model definition
+	// nil if this server is running the new engine
+	llamaModel     *llama.Model
+	llamaModelLock sync.Mutex
+
+	// textProcessor handles text encoding/decoding for the model in the Ollama engine
+	// nil if this server is running the llama.cpp based engine
+	textProcessor model.TextProcessor
 
 	estimate    MemoryEstimate
 	totalLayers uint64
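The two fields introduced above are intended to be mutually exclusive: after NewLlamaServer returns, exactly one of llamaModel and textProcessor is non-nil, and llamaModelLock guards only the cgo model. A minimal sketch of a helper that relies on that invariant (the helper is illustrative and not part of this change):

	// usingOllamaEngine reports which tokenizer backs this server.
	// Sketch only: it assumes the invariant established in NewLlamaServer
	// that exactly one of textProcessor / llamaModel is set.
	func (s *llmServer) usingOllamaEngine() bool {
		return s.textProcessor != nil
	}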
@@ -89,7 +97,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	systemInfo := discover.GetSystemInfo()
 	systemTotalMemory := systemInfo.System.TotalMemory
 	systemFreeMemory := systemInfo.System.FreeMemory
@@ -130,7 +138,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 	slog.Info("offload", "", estimate)
 
 	params := []string{
-		"--model", model,
+		"--model", modelPath,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
 	}
@@ -153,11 +161,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		}
 	}
 
-	if len(projectors) > 0 {
-		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
-		params = append(params, "--mmproj", projectors[0])
-	}
-
 	defaultThreads := systemInfo.GetOptimalThreadCount()
 	if opts.NumThread > 0 {
 		params = append(params, "--threads", strconv.Itoa(opts.NumThread))
@@ -257,6 +260,34 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		}
 	}
 	slog.Debug("compatible gpu libraries", "compatible", compatible)
+	exe, err := os.Executable()
+	if err != nil {
+		return nil, fmt.Errorf("unable to lookup executable path: %w", err)
+	}
+
+	if eval, err := filepath.EvalSymlinks(exe); err == nil {
+		exe = eval
+	}
+
+	var llamaModel *llama.Model
+	var textProcessor model.TextProcessor
+	if envconfig.NewEngine() {
+		textProcessor, err = model.NewTextProcessor(modelPath)
+		if err != nil {
+			// To prepare for opt-out mode, instead of treating this as an error, we fallback to the old runner
+			slog.Debug("model not yet supported by Ollama engine, switching to compatibility mode", "model", modelPath, "error", err)
+		}
+	}
+	if textProcessor == nil {
+		llamaModel, err = llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	if len(projectors) > 0 && llamaModel != nil {
+		params = append(params, "--mmproj", projectors[0])
+	}
 
 	// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
 	// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
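The block above is opt-in: only when envconfig.NewEngine() is set does the loader try the Ollama engine's text processor, and any failure falls back silently to a vocab-only llama.cpp load. A standalone sketch of that selection logic, assuming model.NewTextProcessor and llama.LoadModelFromFile behave exactly as used in the hunk (the wrapper function itself is hypothetical):

	import (
		"github.com/ollama/ollama/llama"
		"github.com/ollama/ollama/model"
	)

	// chooseTokenizer restates the selection above: prefer the Ollama engine's
	// TextProcessor when the new engine is enabled, otherwise fall back to a
	// vocab-only llama.cpp model for tokenize/detokenize support.
	func chooseTokenizer(modelPath string, newEngine bool) (model.TextProcessor, *llama.Model, error) {
		if newEngine {
			if tp, err := model.NewTextProcessor(modelPath); err == nil {
				return tp, nil, nil
			}
			// model not yet supported by the new engine: use compatibility mode below
		}
		// VocabOnly keeps the load cheap; only tokenizer data is needed in this process.
		lm, err := llama.LoadModelFromFile(modelPath, llama.ModelParams{VocabOnly: true})
		if err != nil {
			return nil, nil, err
		}
		return nil, lm, nil
	}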
@@ -275,7 +306,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
 		}
 		finalParams := []string{"runner"}
-		if envconfig.NewEngine() {
+		if textProcessor != nil {
+			// New engine
+			// TODO - if we have failure to load scenarios, add logic to retry with the old runner
 			finalParams = append(finalParams, "--ollama-engine")
 		}
 		finalParams = append(finalParams, params...)
@@ -315,28 +348,20 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 		// finally, add the root library path
 		libraryPaths = append(libraryPaths, discover.LibOllamaPath)
 
-		exe, err := os.Executable()
-		if err != nil {
-			return nil, fmt.Errorf("unable to lookup executable path: %w", err)
-		}
-
-		if eval, err := filepath.EvalSymlinks(exe); err == nil {
-			exe = eval
-		}
-
-		// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
 		s := &llmServer{
-			port:        port,
-			cmd:         exec.Command(exe, finalParams...),
-			status:      NewStatusWriter(os.Stderr),
-			options:     opts,
-			modelPath:   model,
-			estimate:    estimate,
-			numParallel: numParallel,
-			sem:         semaphore.NewWeighted(int64(numParallel)),
-			totalLayers: f.KV().BlockCount() + 1,
-			gpus:        gpus,
-			done:        make(chan error, 1),
+			port:          port,
+			cmd:           exec.Command(exe, finalParams...),
+			status:        NewStatusWriter(os.Stderr),
+			options:       opts,
+			modelPath:     modelPath,
+			llamaModel:    llamaModel,
+			textProcessor: textProcessor,
+			estimate:      estimate,
+			numParallel:   numParallel,
+			sem:           semaphore.NewWeighted(int64(numParallel)),
+			totalLayers:   f.KV().BlockCount() + 1,
+			gpus:          gpus,
+			done:          make(chan error, 1),
 		}
 
 		s.cmd.Env = os.Environ()
@@ -405,6 +430,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapt
 			}
 			err := fmt.Errorf("error starting runner: %v %s", err, msg)
 			if len(compatible) == 0 {
+				if llamaModel != nil {
+					llama.FreeModel(llamaModel)
+				}
 				return nil, err
 			}
 
@@ -701,24 +729,29 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	}
 
 	if len(req.Format) > 0 {
-		switch string(req.Format) {
-		case `null`, `""`:
-			// Field was set, but "missing" a value. We accept
-			// these as "not set".
-			break
-		case `"json"`:
-			request["grammar"] = grammarJSON
-		default:
-			if req.Format[0] != '{' {
-				return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
-			}
+		format := string(req.Format)
+		if format != `null` && format != `""` {
+			if s.textProcessor != nil {
+				// New engine handles this on the backend
+				request["format"] = req.Format
+			} else {
+				// old engine
+				switch format {
+				case `"json"`:
+					request["grammar"] = grammarJSON
+				default:
+					if req.Format[0] != '{' {
+						return fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", req.Format)
+					}
 
-			// User provided a JSON schema
-			g := llama.SchemaToGrammar(req.Format)
-			if g == nil {
-				return fmt.Errorf("invalid JSON schema in format")
+					// User provided a JSON schema
+					g := llama.SchemaToGrammar(req.Format)
+					if g == nil {
+						return fmt.Errorf("invalid JSON schema in format")
+					}
+					request["grammar"] = string(g)
+				}
 			}
-			request["grammar"] = string(g)
 		}
 	}
 
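The net effect of the branch above: the llama.cpp runner still receives a GBNF grammar (either the built-in JSON grammar or one converted from a user-supplied JSON Schema), while the Ollama engine runner receives the raw format value and applies it on the backend. A hedged sketch of the resulting payload fields, assuming req.Format is a json.RawMessage as the indexing and string conversions above imply (the helper itself is hypothetical; grammarJSON is the package-level grammar already referenced in the hunk):

	// formatFields shows the shape of the completion payload for each engine.
	func formatFields(format json.RawMessage, newEngine bool) (map[string]any, error) {
		fields := map[string]any{}
		switch {
		case len(format) == 0, string(format) == `null`, string(format) == `""`:
			// format was set but empty; treat as "not set"
		case newEngine:
			fields["format"] = format // forwarded verbatim; handled by the backend
		case string(format) == `"json"`:
			fields["grammar"] = grammarJSON
		case format[0] == '{':
			g := llama.SchemaToGrammar(format) // JSON Schema -> GBNF
			if g == nil {
				return nil, fmt.Errorf("invalid JSON schema in format")
			}
			fields["grammar"] = string(g)
		default:
			return nil, fmt.Errorf("invalid format: %q; expected \"json\" or a valid JSON Schema object", format)
		}
		return fields, nil
	}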
@@ -933,64 +966,25 @@ type TokenizeResponse struct {
 }
 
 func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
-	s.modelLock.Lock()
-	defer s.modelLock.Unlock()
-	if s.model != nil {
-		return s.model.Tokenize(content, false, true)
-	}
+	s.llamaModelLock.Lock()
+	defer s.llamaModelLock.Unlock()
 
-	// Make sure the server is ready
-	status, err := s.getServerStatus(ctx)
-	if err != nil {
-		return nil, err
-	} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
-		return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
-	}
-
-	data, err := json.Marshal(TokenizeRequest{Content: content})
-	if err != nil {
-		return nil, fmt.Errorf("marshaling encode data: %w", err)
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data))
-	if err != nil {
-		return nil, fmt.Errorf("encode request: %w", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return nil, fmt.Errorf("do encode request: %w", err)
+	if s.llamaModel != nil {
+		return s.llamaModel.Tokenize(content, false, true)
 	}
-	defer resp.Body.Close()
-	if resp.StatusCode == http.StatusNotFound {
-		if s.model == nil {
-			slog.Debug("new runner detected, loading model for cgo tokenization")
-			m, err := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true})
-			if err != nil {
-				return nil, err
-			}
-			s.model = m
+	if s.textProcessor != nil {
+		tokens, err := s.textProcessor.Encode(content)
+		if err != nil {
+			return nil, err
 		}
-		return s.model.Tokenize(content, false, true)
-	}
-
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return nil, fmt.Errorf("read encode request: %w", err)
-	}
-
-	if resp.StatusCode >= 400 {
-		log.Printf("llm encode error: %s", body)
-		return nil, fmt.Errorf("%s", body)
-	}
-
-	var encoded TokenizeResponse
-	if err := json.Unmarshal(body, &encoded); err != nil {
-		return nil, fmt.Errorf("unmarshal encode response: %w", err)
+		toks := make([]int, len(tokens))
+		for i, t := range tokens {
+			toks[i] = int(t)
+		}
+		return toks, nil
 	}
-
-	return encoded.Tokens, nil
+	// not reached
+	return nil, fmt.Errorf("no tokenizer configured")
 }
 
 type DetokenizeRequest struct {
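Callers are unaffected by which engine is active, since both paths sit behind the same LlamaServer interface methods. A small usage sketch of a round trip through the API shown above (the helper function is hypothetical; s is assumed to come from NewLlamaServer):

	// roundTrip encodes a prompt to token IDs and decodes it back, regardless
	// of whether the server is backed by llama.cpp or the Ollama engine.
	func roundTrip(ctx context.Context, s LlamaServer, prompt string) error {
		tokens, err := s.Tokenize(ctx, prompt)
		if err != nil {
			return err
		}
		text, err := s.Detokenize(ctx, tokens)
		if err != nil {
			return err
		}
		slog.Debug("tokenizer round trip", "prompt", prompt, "tokens", len(tokens), "decoded", text)
		return nil
	}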
@@ -1002,80 +996,38 @@ type DetokenizeResponse struct {
 }
 
 func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
-	s.modelLock.Lock()
-	defer s.modelLock.Unlock()
-	if s.model != nil {
+	s.llamaModelLock.Lock()
+	defer s.llamaModelLock.Unlock()
+
+	if s.llamaModel != nil {
 		var resp string
 		for _, token := range tokens {
-			resp += s.model.TokenToPiece(token)
+			resp += s.llamaModel.TokenToPiece(token)
 		}
 		return resp, nil
 	}
-	// Make sure the server is ready
-	status, err := s.getServerStatus(ctx)
-	if err != nil {
-		return "", err
-	} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
-		return "", fmt.Errorf("unexpected server status: %s", status.ToString())
-	}
-
-	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
-	if err != nil {
-		return "", fmt.Errorf("marshaling decode data: %w", err)
-	}
-
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data))
-	if err != nil {
-		return "", fmt.Errorf("decode request: %w", err)
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return "", fmt.Errorf("do decode request: %w", err)
-	}
-	defer resp.Body.Close()
-	if resp.StatusCode == http.StatusNotFound {
-		if s.model == nil {
-			slog.Debug("new runner detected, loading model for cgo tokenization")
-			m, err := llama.LoadModelFromFile(s.modelPath, llama.ModelParams{VocabOnly: true})
-			if err != nil {
-				return "", err
-			}
-			s.model = m
+	if s.textProcessor != nil {
+		toks := make([]int32, len(tokens))
+		for i, t := range tokens {
+			toks[i] = int32(t)
 		}
-		var resp string
-		for _, token := range tokens {
-			resp += s.model.TokenToPiece(token)
+		content, err := s.textProcessor.Decode(toks)
+		if err != nil {
+			return "", err
 		}
-		return resp, nil
-	}
-
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return "", fmt.Errorf("read decode request: %w", err)
+		return content, nil
 	}
-
-	if resp.StatusCode >= 400 {
-		log.Printf("llm decode error: %s", body)
-		return "", fmt.Errorf("%s", body)
-	}
-
-	var decoded DetokenizeResponse
-	if err := json.Unmarshal(body, &decoded); err != nil {
-		return "", fmt.Errorf("unmarshal encode response: %w", err)
-	}
-
-	return decoded.Content, nil
+	// not reached
+	return "", fmt.Errorf("no tokenizer configured")
 }
 
 func (s *llmServer) Close() error {
-	s.modelLock.Lock()
-	if s.model != nil {
-		llama.FreeModel(s.model)
-		s.model = nil
+	s.llamaModelLock.Lock()
+	if s.llamaModel != nil {
+		llama.FreeModel(s.llamaModel)
+		s.llamaModel = nil
 	}
-	s.modelLock.Unlock()
+	s.llamaModelLock.Unlock()
 
 	if s.cmd != nil {
 		slog.Debug("stopping llama server")