@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
@@ -82,8 +82,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
+#else
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
+#endif
+
 
 // available llama models
 enum e_model {
@@ -353,13 +359,22 @@ struct llama_model {
 
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
     ~llama_context() {
+        if (model_owner) {
+            delete &model;
+        }
+#ifdef GGML_USE_METAL
         if (ctx_metal) {
            ggml_metal_free(ctx_metal);
         }
-    }
 #endif
+#ifdef LLAMA_USE_ALLOCATOR
+        if (alloc) {
+            ggml_allocr_free(alloc);
+        }
+#endif
+    }
+
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
@@ -397,7 +412,17 @@ struct llama_context {
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    llama_ctx_buffer buf_alloc;
+    ggml_allocr * alloc = NULL;
+#endif
+
+#ifdef LLAMA_USE_SCRATCH
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    int buf_last = 0;
+    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+#endif
 
 #ifdef GGML_USE_METAL
     ggml_metal_context * ctx_metal = NULL;
@@ -407,9 +432,6 @@ struct llama_context {
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
 
-    int buf_last = 0;
-    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-
     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
         size_t last_size = 0;
@@ -905,6 +927,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
+        /*.mul_mat_q =*/ false,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
@@ -1032,6 +1055,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1160,9 +1184,11 @@ static void llama_model_load_internal(
     }
 
     (void) main_gpu;
+    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
+    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1256,12 +1282,16 @@ static void llama_model_load_internal(
         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
         // this is the total memory required to run the inference
-        const size_t mem_required =
+        size_t mem_required =
             ctx_size +
-            mmapped_size - vram_weights + // weights in VRAM not in memory
+            mmapped_size - vram_weights; // weights in VRAM not in memory
+
+#ifndef LLAMA_USE_ALLOCATOR
+        mem_required +=
             MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
             MEM_REQ_EVAL().at(model.type);
+#endif
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1367,6 +1397,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1377,7 +1408,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+                main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1386,32 +1418,15 @@ static bool llama_model_load(
     }
 }
 
-// evaluate the transformer
-//
-// - lctx: llama context
-// - tokens: new batch of tokens to process
-// - embd embeddings input
-// - n_tokens number of tokens
-// - n_past: the context size so far
-// - n_threads: number of threads to use
-//
-static bool llama_eval_internal(
+static struct ggml_cgraph * llama_build_graph(
         llama_context & lctx,
     const llama_token * tokens,
           const float * embd,
                     int n_tokens,
-                    int n_past,
-                    int n_threads,
-          const char * cgraph_fname) {
+                    int n_past) {
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
-    const int64_t t_start_us = ggml_time_us();
-
     const int N = n_tokens;
 
     const auto & model = lctx.model;
@@ -1427,10 +1442,8 @@ static bool llama_eval_internal(
     const int64_t n_head = hparams.n_head;
     const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
-    const int64_t n_vocab = hparams.n_vocab;
     const int64_t n_embd_gqa = hparams.n_embd_gqa();
 
-
     LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base = hparams.rope_freq_base;
@@ -1442,26 +1455,35 @@
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute = lctx.buf_compute;
 
+
     struct ggml_init_params params = {
         /*.mem_size =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.addr,
         /*.no_alloc =*/ false,
     };
 
+#ifdef LLAMA_USE_ALLOCATOR
+    params.no_alloc = true;
+#endif
+
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
-
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+#else
         memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+#endif
         ggml_set_name(inp_tokens, "inp_tokens");
 
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1471,7 +1493,15 @@ static bool llama_eval_internal(
 #endif
 
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+#else
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+#endif
     }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1498,6 +1528,17 @@ static bool llama_eval_internal(
     }
 #endif // GGML_USE_CUBLAS
 
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+#else
+    ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+#endif
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
 
@@ -1593,9 +1634,6 @@ static bool llama_eval_internal(
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
-            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-            ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
-
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
@@ -1711,9 +1749,6 @@ static bool llama_eval_internal(
 
     lctx.use_buf(ctx0, 0);
 
-    // used at the end to optionally extract the embeddings
-    struct ggml_tensor * embeddings = NULL;
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1724,8 +1759,6 @@ static bool llama_eval_internal(
         cur = ggml_mul(ctx0, cur, model.norm);
         // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");
-
-        embeddings = cur;
     }
 
     // lm_head
@@ -1737,12 +1770,88 @@ static bool llama_eval_internal(
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
 
-    // run the computation
     ggml_build_forward_expand(gf, cur);
 
-    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+
+#if 0
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+            ggml_used_mem(ctx0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
+#endif
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+// evaluate the transformer
+//
+// - lctx: llama context
+// - tokens: new batch of tokens to process
+// - embd embeddings input
+// - n_tokens number of tokens
+// - n_past: the context size so far
+// - n_threads: number of threads to use
+//
+static bool llama_eval_internal(
+        llama_context & lctx,
+    const llama_token * tokens,
+          const float * embd,
+                    int n_tokens,
+                    int n_past,
+                    int n_threads,
+          const char * cgraph_fname) {
+
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
+    const int64_t t_start_us = ggml_time_us();
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    LLAMA_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_vocab = hparams.n_vocab;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_reset(lctx.alloc);
+#endif
+
+    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc_graph(lctx.alloc, gf);
+#endif
+
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+    // for big prompts, if BLAS is enabled, it is better to use only one thread
+    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
 
 #if GGML_USE_MPI
+    const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
@@ -1754,7 +1863,10 @@ static bool llama_eval_internal(
         //}
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor (lctx.ctx_metal, cur);
+        ggml_metal_get_tensor (lctx.ctx_metal, res);
+        if (!lctx.embedding.empty()) {
+            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+        }
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1785,8 +1897,6 @@ static bool llama_eval_internal(
     // update kv token count
     lctx.kv_self.n = n_past + N;
 
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-
     if (cgraph_fname) {
         ggml_graph_export(gf, cgraph_fname);
     }
@@ -1824,21 +1934,6 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }
 
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-
-#if 0
-    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-            ggml_used_mem(ctx0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0,
-            lctx.work_buffer.size()/1024.0/1024.0,
-            n_past, N);
-#endif
-
-    ggml_free(ctx0);
-
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1950,7 +2045,9 @@ struct llama_tokenizer {
             if (token == vocab_.token_to_id.end()) {
                 // output any symbols that did not form tokens as bytes.
                 for (int j = 0; j < (int) symbol.n; ++j) {
-                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    // NOTE: old version, before #2420 - not sure what are the implications of this
+                    //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                     output.push_back(token_id);
                 }
             } else {
@@ -3127,7 +3224,7 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
         delete model;
@@ -3204,10 +3301,47 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
+#ifdef LLAMA_USE_ALLOCATOR
+        {
+            static const size_t tensor_alignment = 32;
+            // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+            ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+
+            // create measure allocator
+            ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+
+            // build worst-case graph
+            int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+            int n_past = hparams.n_ctx - n_tokens;
+            llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+
+            // measure memory requirements for the graph
+            size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+
+            fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+
+            // debug - for comparison with scratch buffer
+            //size_t prev_req =
+            //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+            //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
+            //    MEM_REQ_EVAL().at(ctx->model.type);
+            //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+
+            // recreate allocator with exact memory requirements
+            ggml_allocr_free(ctx->alloc);
+
+            ctx->buf_alloc.resize(alloc_size);
+            ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        }
+#else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+#endif
 
+#ifdef LLAMA_USE_SCRATCH
         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+#endif
     }
 
 #ifdef GGML_USE_METAL
@@ -3277,9 +3411,6 @@ struct llama_context * llama_init_from_file(
 }
 
 void llama_free(struct llama_context * ctx) {
-    if (ctx->model_owner) {
-        delete &ctx->model;
-    }
     delete ctx;
 }
 