@@ -10,20 +10,19 @@ package llm
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/common/libcommon.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/examples/server/libext_server.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libllama.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/metal/libggml_static.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libcommon.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libext_server.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libllama.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
-#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
-#cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/lib
-#cgo windows LDFLAGS: -lext_server_shared -lpthread
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libext_server.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libcommon.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libllama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libggml_static.a
+#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
+#cgo linux windows LDFLAGS: -lpthread

#include <stdlib.h>
#include "ext_server.h"
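Taken together, these LDFLAGS changes move every static archive from the old flat build/metal and build/cpu directories into a per-OS, per-variant tree, build/<GOOS>/<variant>/lib, the Windows link against ext_server_shared disappears from this file, and -lpthread is declared once for linux and windows. A minimal sketch of the layout the directives assume, written as a hypothetical Go helper (staticLibDir is illustration only, not part of the change):

package llm

import (
	"path/filepath"
	"runtime"
)

// staticLibDir mirrors the archive layout the cgo LDFLAGS above point at:
// ${SRCDIR}/llama.cpp/gguf/build/<GOOS>/<variant>/lib, e.g. darwin/metal or linux/cpu.
func staticLibDir(srcDir, variant string) string {
	return filepath.Join(srcDir, "llama.cpp", "gguf", "build", runtime.GOOS, variant, "lib")
}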
@@ -46,25 +45,6 @@ import (
"github.com/jmorganca/ollama/gpu"
)

-func newExtServerResp(len C.size_t) C.ext_server_resp_t {
- var resp C.ext_server_resp_t
- resp.msg_len = len
- bytes := make([]byte, len)
- resp.msg = (*C.char)(C.CBytes(bytes))
- return resp
-}
-
-func freeExtServerResp(resp C.ext_server_resp_t) {
- if resp.msg_len == 0 {
- return
- }
- C.free(unsafe.Pointer(resp.msg))
-}
-
-func extServerResponseToErr(resp C.ext_server_resp_t) error {
- return fmt.Errorf(C.GoString(resp.msg))
-}
-
type extServer interface {
LLM
llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
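The three helpers removed above are not deleted: they reappear unchanged below the extServer interface (the + lines in the next hunk), so this hunk is a relocation. Their intended call pattern, sketched from the predict context lines later in this diff; startCompletion is a hypothetical name, and the negative-id error convention for ext_server_resp_t is an assumption, not something shown here:

func startCompletion(llm extServer, jsonReq *C.char) error {
	resp := newExtServerResp(128) // msg buffer is allocated on the C heap
	defer freeExtServerResp(resp) // caller owns the buffer and must free it
	llm.llama_server_completion(jsonReq, &resp)
	if resp.id < 0 { // assumption: a negative id signals an error
		return extServerResponseToErr(resp)
	}
	return nil
}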
@@ -80,52 +60,26 @@ type extServer interface {
llama_server_release_json_resp(json_resp **C.char)
}

-type llamaExtServer struct {
- api.Options
-}
-
// Note: current implementation does not support concurrent instantiations
var mutex sync.Mutex

-func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
- C.llama_server_init(sparams, err)
-}
-func (llm *llamaExtServer) llama_server_start() {
- C.llama_server_start()
-}
-func (llm *llamaExtServer) llama_server_stop() {
- C.llama_server_stop()
-}
-
-func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
- C.llama_server_completion(json_req, resp)
-}
-func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
- C.llama_server_completion_next_result(task_id, resp)
-}
-func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
- C.llama_server_completion_cancel(task_id, err)
-}
-func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
- C.llama_server_release_task_result(result)
+func newExtServerResp(len C.size_t) C.ext_server_resp_t {
+ var resp C.ext_server_resp_t
+ resp.msg_len = len
+ bytes := make([]byte, len)
+ resp.msg = (*C.char)(C.CBytes(bytes))
+ return resp
}

-func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
- C.llama_server_tokenize(json_req, json_resp, err)
-}
-func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
- C.llama_server_detokenize(json_req, json_resp, err)
-}
-func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
- C.llama_server_embedding(json_req, json_resp, err)
-}
-func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
- C.llama_server_release_json_resp(json_resp)
+func freeExtServerResp(resp C.ext_server_resp_t) {
+ if resp.msg_len == 0 {
+ return
+ }
+ C.free(unsafe.Pointer(resp.msg))
}

-func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
- server := &llamaExtServer{opts}
- return newExtServer(server, model, adapters, projectors, numLayers, opts)
+func extServerResponseToErr(resp C.ext_server_resp_t) error {
+ return fmt.Errorf(C.GoString(resp.msg))
}

func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
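With llamaExtServer and its cgo pass-through methods removed from this file, a concrete extServer implementation presumably moves to a platform-specific file and looks much like the deleted code. A trimmed sketch mirroring the removed lines (the type's new location is an assumption; the file would still need this package's cgo preamble for the C.* types):

type llamaExtServer struct {
	api.Options
}

func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
	C.llama_server_init(sparams, err)
}

func (llm *llamaExtServer) llama_server_start() { C.llama_server_start() }
func (llm *llamaExtServer) llama_server_stop()  { C.llama_server_stop() }

// ...the remaining extServer methods delegate to their C counterparts the same way.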
@@ -199,10 +153,6 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
return server, nil
}

-func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
- return predict(llm, llm.Options, ctx, pred, fn)
-}
-
func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
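This hunk, and the ones that follow for Encode, Decode, Embedding, and Close, drop the exported wrapper methods from this file while keeping the shared package-level implementations (predict, encode, decode, embedding, close). Each concrete extServer is then expected to supply thin wrappers that delegate to them; a sketch reusing the hypothetical llamaExtServer above:

func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
	return predict(llm, llm.Options, ctx, pred, fn)
}

func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
	return encode(llm, ctx, prompt)
}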
@@ -326,9 +276,6 @@ func predict(llm extServer, opts api.Options, ctx context.Context, predict Predi
// should never reach here ideally
return fmt.Errorf("max retries exceeded")
}
-func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
- return encode(llm, ctx, prompt)
-}

func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
@@ -354,10 +301,6 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
return encoded.Tokens, err
}

-func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
- return decode(llm, ctx, tokens)
-}
-
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 {
return "", nil
@@ -386,9 +329,6 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
return decoded.Content, err
}

-func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
- return embedding(llm, ctx, input)
-}
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil {
@@ -414,10 +354,6 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err
return embedding.Embedding, nil
}

-func (llm *llamaExtServer) Close() {
- close(llm)
-}
-
func close(llm extServer) {
llm.llama_server_stop()
mutex.Unlock()
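The package-level close stops the C server and releases the mutex that guards against concurrent instantiations (presumably locked in newExtServer, which this diff does not show). A lifecycle sketch: the Close wrapper mirrors the method removed above, and runOnce is a hypothetical call site for illustration:

func (llm *llamaExtServer) Close() {
	close(llm) // stops the C server and releases the instantiation mutex
}

func runOnce(model string, numLayers int64, opts api.Options) error {
	srv, err := newExtServer(&llamaExtServer{opts}, model, nil, nil, numLayers, opts)
	if err != nil {
		return err
	}
	defer srv.Close()
	// ... issue Predict / Encode / Decode / Embedding calls on srv ...
	return nil
}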