
llm: update llama.cpp commit to `7c26775` (#4896)

* llm: update llama.cpp submodule to `7c26775`

* disable `LLAMA_BLAS` for now

* `-DLLAMA_OPENMP=off`
Jeffrey Morgan, 10 months ago
commit 152fc202f5
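In short: every CPU build now configures llama.cpp with -DLLAMA_OPENMP=off, and the darwin script additionally passes -DLLAMA_BLAS=off. A minimal sketch of the equivalent bare configure step (source and build paths here are illustrative, not taken from the scripts below):

    # Sketch only: the options this commit threads through the CPU builds.
    # -DLLAMA_OPENMP=off  -> build without an OpenMP runtime dependency
    # -DLLAMA_BLAS=off    -> keep BLAS disabled for now, per the commit message
    cmake -S llm/llama.cpp -B build -DLLAMA_OPENMP=off -DLLAMA_BLAS=off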

+ 6 - 6
llm/generate/gen_darwin.sh

@@ -18,7 +18,7 @@ sign() {
     fi
 }
 
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
 
 case "${GOARCH}" in
 "amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
         # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
         #
         init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/cpu"
         echo "Building LCD CPU"
         build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
         # Approximately 400% faster than LCD on same CPU
         #
         init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
         echo "Building AVX CPU"
         build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
         # Approximately 10% faster than AVX on same CPU
         #
         init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
         echo "Building AVX2 CPU"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,7 +75,7 @@ case "${GOARCH}" in
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build
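To confirm a darwin artifact really has no OpenMP runtime dependency, a spot check along these lines should work; the library path is a placeholder for whatever the build drops in BUILD_DIR:

    # Hypothetical path: substitute the actual artifact from BUILD_DIR.
    otool -L ../build/darwin/arm64/cpu/libllama.dylib | grep -i omp \
        || echo 'no OpenMP runtime linked'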

+ 3 - 3
llm/generate/gen_linux.sh

@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}_static"
     echo "Building static library"
     build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
         # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
 
-        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
+        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
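The same spot check on Linux, again with a placeholder path for the built shared object:

    # Hypothetical path: substitute the actual artifact from BUILD_DIR.
    ldd ../build/linux/x86_64/cpu/libllama.so | grep -E 'libgomp|libomp' \
        || echo 'no OpenMP runtime linked'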

+ 4 - 2
llm/generate/gen_windows.ps1

@@ -37,7 +37,8 @@ function init_vars {
     }
     $script:cmakeDefs = @(
         "-DBUILD_SHARED_LIBS=on",
-        "-DLLAMA_NATIVE=off"
+        "-DLLAMA_NATIVE=off",
+        "-DLLAMA_OPENMP=off"
         )
     $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
     $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -206,7 +207,8 @@ function build_static() {
             "-DLLAMA_AVX2=off",
             "-DLLAMA_AVX512=off",
             "-DLLAMA_F16C=off",
-            "-DLLAMA_FMA=off")
+            "-DLLAMA_FMA=off",
+            "-DLLAMA_OPENMP=off")
         $script:buildDir="../build/windows/${script:ARCH}_static"
         write-host "Building static library"
         build

+ 1 - 1
llm/llama.cpp

@@ -1 +1 @@
-Subproject commit 5921b8f089d3b7bda86aac5a66825df6a6c10603
+Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c
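For reference, a submodule pin like this one is typically advanced with the standard git sequence below (a sketch of the generic workflow, not this repository's release tooling):

    git -C llm/llama.cpp fetch origin
    git -C llm/llama.cpp checkout 7c26775adb579e92b59c82e8084c07a1d0f75e9c
    git add llm/llama.cpp
    git commit -m 'llm: update llama.cpp submodule to 7c26775'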

+ 9 - 9
llm/patches/01-load-progress.diff

@@ -1,8 +1,8 @@
 diff --git a/common/common.cpp b/common/common.cpp
-index ba1ecf0e..cead57cc 100644
+index 73ff0e85..6adb1a92 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
      mparams.use_mmap        = params.use_mmap;
      mparams.use_mlock       = params.use_mlock;
      mparams.check_tensors   = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
          mparams.kv_overrides = NULL;
      } else {
 diff --git a/common/common.h b/common/common.h
-index d80344f2..71e84834 100644
+index 58ed72f4..0bb2605e 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -174,6 +174,13 @@ struct gpt_params {
-     // multimodal models (see examples/llava)
+@@ -180,6 +180,13 @@ struct gpt_params {
      std::string mmproj = "";        // path to multimodal projector
      std::vector<std::string> image; // path to image file(s)
-+
+ 
 +    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 +    // If the provided progress_callback returns true, model loading continues.
 +    // If it returns false, model loading is immediately aborted.
 +    llama_progress_callback progress_callback = NULL;
 +    // context pointer passed to the progress callback
 +    void * progress_callback_user_data;
- };
- 
- void gpt_params_handle_model_default(gpt_params & params);
++
+     // server params
+     int32_t port           = 8080;         // server listens on this network port
+     int32_t timeout_read   = 600;          // http read timeout in seconds
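The index hashes and hunk offsets above change because the patch was rebased onto the new submodule commit. A quick way to verify that refreshed patches still apply cleanly (the loop is an assumed workflow, not a script shipped in this repo):

    cd llm/llama.cpp
    for p in ../patches/*.diff; do
        git apply --check "$p" && echo "applies cleanly: $p"
    done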

+ 8 - 8
llm/patches/05-default-pretokenizer.diff

@@ -1,8 +1,8 @@
 diff --git a/llama.cpp b/llama.cpp
-index 40d2ec2c..74f3ee9c 100644
+index 61948751..4b72a293 100644
 --- a/llama.cpp
 +++ b/llama.cpp
-@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
+@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
  
          // for now, only BPE models have pre-tokenizers
          if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644
 -                LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
 -                LLAMA_LOG_WARN("%s:                                             \n", __func__);
 -                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--            } else if (
-+            if (
-                     tokenizer_pre == "default") {
+-            } else if (tokenizer_pre == "default") {
++            if (tokenizer_pre == "default") {
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
              } else if (
-@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "smaug-bpe") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+                     tokenizer_pre == "llama3"   ||
+@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
+                 tokenizer_pre == "poro-chat") {
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
              } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
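With this patch, an unknown pre-tokenizer downgrades from a fatal std::runtime_error to a warning plus the default pre-tokenizer. One way to confirm the fallback made it into the patched tree (the search string is taken verbatim from the hunk above):

    grep -n 'missing or unrecognized pre-tokenizer' llm/llama.cpp/llama.cpp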