
update `llama.cpp` to `f64d44a`

Jeffrey Morgan 1 year ago
parent
commit
22885aeaee
19 changed files with 1303 additions and 542 deletions
  1. llm/ggml-alloc.c (+9 -1)
  2. llm/ggml-alloc.h (+1 -1)
  3. llm/ggml-cuda.cu (+454 -175)
  4. llm/ggml-cuda.h (+1 -1)
  5. llm/ggml-metal.h (+1 -1)
  6. llm/ggml-metal.m (+40 -19)
  7. llm/ggml-metal.metal (+1 -1)
  8. llm/ggml-mpi.c (+1 -1)
  9. llm/ggml-mpi.h (+1 -1)
  10. llm/ggml-opencl.cpp (+1 -1)
  11. llm/ggml-opencl.h (+1 -1)
  12. llm/ggml.c (+397 -151)
  13. llm/ggml.h (+114 -33)
  14. llm/k_quants.c (+1 -1)
  15. llm/k_quants.h (+1 -1)
  16. llm/llama-util.h (+42 -2)
  17. llm/llama.cpp (+215 -147)
  18. llm/llama.go (+3 -2)
  19. llm/llama.h (+19 -2)

+ 9 - 1
llm/ggml-alloc.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *
@@ -420,6 +420,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                     if (parent == NULL) {
                         break;
                     }
+
+                    // if the node's data is external, then we cannot re-use it
+                    if ((char *) parent->data < (char *) alloc->data ||
+                        (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                        AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                        continue;
+                    }
+
                     struct hash_node * p_hn = hash_get(ht, parent);
                     if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                         if (ggml_is_view(parent)) {
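The guard added to allocate_node is a plain pointer-range test: a parent tensor whose data lies outside the buffer the allocator owns (for example, weights that live in an mmap'd model file) can never be handed back to this allocator, so it is skipped as a reuse candidate. A minimal sketch of the same test, with hypothetical names standing in for alloc->data and alloc->size:

    // illustrative only: true when p points inside the allocator's managed buffer
    // (assumes <cstddef> for size_t)
    static bool ptr_in_buffer(const void * p, const void * buf, size_t buf_size) {
        const char * c  = (const char *) p;
        const char * lo = (const char *) buf;
        return c >= lo && c < lo + buf_size; // false => data is external, do not reuse
    }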

+ 1 - 1
llm/ggml-alloc.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

File diff suppressed because it is too large
+ 454 - 175
llm/ggml-cuda.cu


+ 1 - 1
llm/ggml-cuda.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

+ 1 - 1
llm/ggml-metal.h

@@ -1,7 +1,7 @@
 //go:build darwin
 
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

+ 40 - 19
llm/ggml-metal.m

@@ -1,7 +1,7 @@
 //go:build darwin
 
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *
@@ -35,6 +35,11 @@
 #import <Metal/Metal.h>
 #import <MetalPerformanceShaders/MetalPerformanceShaders.h>
 
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 #ifdef GGML_METAL_NDEBUG
 #define metal_printf(...)
 #else
@@ -43,6 +48,8 @@
 
 #define UNUSED(x) (void)(x)
 
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
 struct ggml_metal_buffer {
     const char * name;
 
@@ -64,7 +71,7 @@ struct ggml_metal_context {
     int n_buffers;
     struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
 
-    int concur_list[GGML_MAX_NODES];
+    int concur_list[GGML_MAX_CONCUR];
     int concur_list_len;
 
     // custom kernels
@@ -398,15 +405,15 @@ void ggml_metal_graph_find_concurrency(
         struct ggml_metal_context * ctx,
         struct ggml_cgraph * gf) {
     int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
-    int nodes_unused[GGML_MAX_NODES];
+    int nodes_unused[GGML_MAX_CONCUR];
 
-    for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
-    for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
+    for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
+    for (int i = 0; i < gf->n_nodes;     i++) { nodes_unused[i]     = 1; }
     ctx->concur_list_len = 0;
 
-    int n_left = gf->n_nodes;
-    int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
-    int level_pos = 0;  // at ctx->concur_list, the last layer (level) ends at level_pos
+    int n_left    = gf->n_nodes;
+    int n_start   = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
+    int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
 
     while (n_left > 0) {
         // number of nodes at a layer (that can be issued concurrently)
@@ -414,28 +421,40 @@ void ggml_metal_graph_find_concurrency(
         for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
             if (nodes_unused[i]) {
                 // if the requirements for gf->nodes[i] are satisfied
-                int exe_flag=1;
+                int exe_flag = 1;
+
                 // scan all srcs
                 for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
                     struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
                     if (src_cur) {
                         // if is leaf nodes it's satisfied.
-                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
+                        // TODO: ggml_is_leaf()
+                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
+                            continue;
+                        }
 
                         // otherwise this src should be the output from previous nodes.
                         int is_found = 0;
+
                         // scan 2*search_depth back because we inserted barrier.
-                        for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
-                            if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
+                        //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
+                        for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
+                            if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
+                                is_found = 1;
+                                break;
+                            }
+                        }
+                        if (is_found == 0) {
+                            exe_flag = 0;
+                            break;
                         }
-                        if (is_found == 0) {exe_flag = 0; break;}
                     }
                 }
                 if (exe_flag) {
                     // check if nodes[i]'s data will be overwritten by a node before nodes[i].
                     // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
                     int64_t data_start = (int64_t) gf->nodes[i]->data;
-                    int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
+                    int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
                     for (int j = n_start; j < i; j++) {
                         if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
                                             && gf->nodes[j]->op != GGML_OP_VIEW \
@@ -444,9 +463,9 @@ void ggml_metal_graph_find_concurrency(
                             if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
                                 ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
                                 continue;
-                            } else {
-                                exe_flag = 0;
                             }
+
+                            exe_flag = 0;
                         }
                     }
                 }
@@ -463,11 +482,13 @@ void ggml_metal_graph_find_concurrency(
         ctx->concur_list[level_pos + concurrency] = -1;
         ctx->concur_list_len++;
         // jump all sorted nodes at nodes_bak
-        while (!nodes_unused[n_start]) {n_start++;}
+        while (!nodes_unused[n_start]) {
+            n_start++;
+        }
         level_pos += concurrency + 1;
     }
 
-    if (ctx->concur_list_len > GGML_MAX_NODES) {
+    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
         fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
     }
 }
@@ -481,7 +502,7 @@ void ggml_metal_graph_compute(
     // else fallback to serial dispatch
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
 
-    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
+    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
 
     const int n_nodes  = has_concur ? ctx->concur_list_len      : gf->n_nodes;
     edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
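ctx->concur_list holds one entry per scheduled node plus a -1 barrier entry terminating each concurrency level, so a graph with no exploitable concurrency at all (one node per level) needs roughly two slots per node. That is why the bound grows from GGML_MAX_NODES to GGML_MAX_CONCUR = 2*GGML_MAX_NODES. As a rough worked example, assuming GGML_MAX_NODES is 4096: a fully serial graph of 4096 nodes produces 4096 node indices plus 4096 barriers, i.e. 8192 entries, exactly the new limit.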

+ 1 - 1
llm/ggml-metal.metal

@@ -1,7 +1,7 @@
 //go:build darwin
 
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

+ 1 - 1
llm/ggml-mpi.c

@@ -1,7 +1,7 @@
 //go:build mpi
 
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

+ 1 - 1
llm/ggml-mpi.h

@@ -1,7 +1,7 @@
 //go:build mpi
 
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

+ 1 - 1
llm/ggml-opencl.cpp

@@ -1,7 +1,7 @@
 //go:build opencl
 
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

+ 1 - 1
llm/ggml-opencl.h

@@ -1,7 +1,7 @@
 //go:build opencl
 
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

+ 397 - 151
llm/ggml.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *
@@ -221,8 +221,8 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
 #else
-inline static void* ggml_aligned_malloc(size_t size) {
-    void* aligned_memory = NULL;
+inline static void * ggml_aligned_malloc(size_t size) {
+    void * aligned_memory = NULL;
 #ifdef GGML_USE_METAL
     int result = posix_memalign(&aligned_memory, getpagesize(), size);
 #else
@@ -3837,7 +3837,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3909,7 +3909,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4136,7 +4136,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
 
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
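ggml_nbytes is now padded up to the memory alignment, so the size it reports is always a multiple of GGML_MEM_ALIGN. A short worked example, assuming GGML_MEM_ALIGN is 16: a 1-D F32 tensor with 5 elements holds 5*4 = 20 bytes of data, and GGML_PAD rounds that up to the next multiple of 16, so ggml_nbytes returns 32 rather than 20.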
@@ -4279,7 +4279,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -4628,7 +4628,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
         /*.op           =*/ GGML_OP_NONE,
-        /*.op_params    =*/ {0},
+        /*.op_params    =*/ { 0 },
         /*.is_param     =*/ false,
         /*.grad         =*/ NULL,
         /*.src          =*/ { NULL },
@@ -4660,6 +4660,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 }
 
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
     assert(params_size <= GGML_MAX_OP_PARAMS);
     memcpy(tensor->op_params, params, params_size);
 }
@@ -6465,7 +6466,7 @@ struct ggml_tensor * ggml_permute(
     result->src[0] = a;
 
     int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     return result;
 }
@@ -6591,7 +6592,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6631,7 +6632,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6747,9 +6748,9 @@ static struct ggml_tensor * ggml_rope_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[6] = { n_past, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base, sizeof(float));
+    memcpy(params + 4, &freq_base,  sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6823,7 +6824,7 @@ struct ggml_tensor * ggml_rope_back(
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, n_dims, mode, n_ctx };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6854,7 +6855,7 @@ struct ggml_tensor * ggml_alibi(
 
     int32_t op_params[3] = { n_past, n_head };
     memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, &op_params, sizeof(op_params));
+    ggml_set_op_params(result, op_params, sizeof(op_params));
 
     result->op   = GGML_OP_ALIBI;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6881,7 +6882,7 @@ struct ggml_tensor * ggml_clamp(
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
     float params[] = { min, max };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_CLAMP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6916,10 +6917,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
         a->ne[2], 1, 1,
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6931,10 +6932,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
 
 // ggml_conv_2d
 
-struct ggml_tensor* ggml_conv_2d(
-    struct ggml_context* ctx,
-    struct ggml_tensor * a,
-    struct ggml_tensor * b,
+struct ggml_tensor * ggml_conv_2d(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,
+    struct ggml_tensor  * b,
     int                  s0,
     int                  s1,
     int                  p0,
@@ -6955,10 +6956,10 @@ struct ggml_tensor* ggml_conv_2d(
         ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
         a->ne[3], b->ne[3],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { s0, s1, p0, p1, d0, d1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6971,7 +6972,7 @@ struct ggml_tensor* ggml_conv_2d(
 
 // ggml_conv_1d_ph
 
-struct ggml_tensor* ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_1d_ph(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b,
@@ -6989,7 +6990,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
 
 // ggml_pool_1d
 
-struct ggml_tensor* ggml_pool_1d(
+struct ggml_tensor * ggml_pool_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         enum ggml_op_pool     op,
@@ -7008,10 +7009,10 @@ struct ggml_tensor* ggml_pool_1d(
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7022,7 +7023,7 @@ struct ggml_tensor* ggml_pool_1d(
 
 // ggml_pool_2d
 
-struct ggml_tensor* ggml_pool_2d(
+struct ggml_tensor * ggml_pool_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         enum ggml_op_pool     op,
@@ -7045,10 +7046,10 @@ struct ggml_tensor* ggml_pool_2d(
         ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
         a->ne[2],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7216,7 +7217,7 @@ struct ggml_tensor * ggml_win_part(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_WIN_PART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7246,7 +7247,7 @@ struct ggml_tensor * ggml_win_unpart(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_WIN_UNPART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7375,7 +7376,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
-// ggml_map_custom1
+// ggml_map_custom1_f32
 
 static struct ggml_tensor * ggml_map_custom1_impl_f32(
         struct ggml_context          * ctx,
@@ -7392,7 +7393,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM1;
+    result->op = GGML_OP_MAP_CUSTOM1_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 
@@ -7413,7 +7414,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
     return ggml_map_custom1_impl_f32(ctx, a, fun, true);
 }
 
-// ggml_map_custom2
+// ggml_map_custom2_f32
 
 static struct ggml_tensor * ggml_map_custom2_impl_f32(
         struct ggml_context          * ctx,
@@ -7431,7 +7432,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM2;
+    result->op = GGML_OP_MAP_CUSTOM2_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7455,7 +7456,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
     return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
 }
 
-// ggml_map_custom3
+// ggml_map_custom3_f32
 
 static struct ggml_tensor * ggml_map_custom3_impl_f32(
         struct ggml_context          * ctx,
@@ -7474,7 +7475,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM3;
+    result->op = GGML_OP_MAP_CUSTOM3_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7501,6 +7502,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
     return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
 }
 
+// ggml_map_custom1
+struct ggml_map_custom1_op_params {
+    ggml_custom1_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom1_impl(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        const  ggml_custom1_op_t       fun,
+        int                            n_tasks,
+        void                         * userdata,
+        bool                           inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom1_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        const  ggml_custom1_op_t       fun,
+        int                            n_tasks,
+        void                         * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        const  ggml_custom1_op_t       fun,
+        int                            n_tasks,
+        void                         * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_map_custom2_op_params {
+    ggml_custom2_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom2_impl(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        const  ggml_custom2_op_t       fun,
+        int                            n_tasks,
+        void                         * userdata,
+        bool                           inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom2_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        const  ggml_custom2_op_t       fun,
+        int                            n_tasks,
+        void                         * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        const  ggml_custom2_op_t       fun,
+        int                            n_tasks,
+        void                         * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_map_custom3_op_params {
+    ggml_custom3_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom3_impl(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        struct ggml_tensor           * c,
+        const  ggml_custom3_op_t       fun,
+        int                            n_tasks,
+        void                         * userdata,
+        bool                           inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom3_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        struct ggml_tensor           * c,
+        const  ggml_custom3_op_t       fun,
+        int                            n_tasks,
+        void                         * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        struct ggml_tensor           * c,
+        const  ggml_custom3_op_t       fun,
+        int                            n_tasks,
+        void                         * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
+}
+
+
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
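The three *_op_params structs above are copied verbatim into tensor->op_params by ggml_set_op_params, which is how the function pointer, n_tasks and userdata survive until graph execution. A minimal usage sketch of the new API (scale_rows, build_scaled, ctx and x are made-up names; assumes ggml.h is included and the input is a contiguous F32 tensor):

    // illustrative only: multiply every element of a by *userdata, rows split across threads
    static void scale_rows(struct ggml_tensor * dst, const struct ggml_tensor * a,
                           int ith, int nth, void * userdata) {
        const float factor = *(const float *) userdata;
        const int64_t nr = ggml_nrows(a);
        for (int64_t ir = ith; ir < nr; ir += nth) { // round-robin rows over the nth threads
            const float * src = (const float *) ((const char *) a->data   + ir*a->nb[1]);
            float       * out = (float       *) ((char       *) dst->data + ir*dst->nb[1]);
            for (int64_t i = 0; i < a->ne[0]; ++i) {
                out[i] = factor*src[i];
            }
        }
    }

    // hypothetical graph-building helper; GGML_N_TASKS_MAX lets ggml use all available threads
    static struct ggml_tensor * build_scaled(struct ggml_context * ctx, struct ggml_tensor * x, float * factor) {
        return ggml_map_custom1(ctx, x, scale_rows, GGML_N_TASKS_MAX, factor);
    }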
@@ -9309,8 +9494,8 @@ static void ggml_compute_forward_sum_rows_f32(
     for (int64_t i3 = 0; i3 < ne03; i3++) {
         for (int64_t i2 = 0; i2 < ne02; i2++) {
             for (int64_t i1 = 0; i1 < ne01; i1++) {
-                float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
-                float* dst_row = (float *) ((char *) dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
+                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+                float * dst_row = (float *) ((char *) dst->data  + i1*nb1  + i2*nb2  + i3*nb3);
                 float row_sum = 0;
                 ggml_vec_sum_f32(ne00, &row_sum, src_row);
                 dst_row[0] = row_sum;
@@ -10572,71 +10757,95 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-    // parallelize by src0 rows
-    const int64_t dr = (ne01 + nth - 1)/nth;
+    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+
+    const int64_t nr0 = ne01;           // src0 rows
+    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
 
-    const int64_t ir10 = dr*ith;
-    const int64_t ir11 = MIN(ir10 + dr, ne01);
+    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
-    // src1 rows
-    const int64_t nr1 = ne11*ne12*ne13;
+    // distribute the thread work across the inner or outer loop based on which one is larger
 
-    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
+    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
 
-    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
-        const int64_t i13 = (ir1/(ne12*ne11));
-        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
-
-        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
-        const int64_t i03 = (ir0/(ne02));
-        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
-        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
-        // GG: this is likely the correct way to broadcast, though need some more thought
-        //     therefore leaving the comments to remind us for now
-        const int64_t i02 = (i12 / (ne12 / ne02));
-        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
-        // const int64_t i02 = (ir0 - i03*ne02);
-
-        const int64_t i1 = i11;
-        const int64_t i2 = i12;
-        const int64_t i3 = i13;
-
-        const char * src0_row = (const char *) src0->data + (  0 + i02*nb02 + i03*nb03     );
-
-        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-        //       the original src1 data pointer, so we should index using the indices directly
-        // TODO: this is a bit of a hack, we should probably have a better way to handle this
-        const char * src1_col = (const char *) wdata +
-            (src1_cont || src1->type != vec_dot_type
-             ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-             : (i11*nb11 + i12*nb12 + i13*nb13));
-
-        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-        for (int64_t ir = ir10; ir < ir11; ++ir) {
-            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
-        }
+    const int64_t ith0 = ith % nth0;
+    const int64_t ith1 = ith / nth0;
+
+    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+    const int64_t ir010 = dr0*ith0;
+    const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+    const int64_t ir110 = dr1*ith1;
+    const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir010 >= ir011 || ir110 >= ir111) {
+        sched_yield();
+        return;
     }
 
-    //int64_t t1 = ggml_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
 
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
 
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
 
-// ggml_compute_forward_out_prod
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    float tmp[16];
 
+    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne11));
+                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                     ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
+                     : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                }
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
         const struct ggml_compute_params * params,
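The rewritten loop no longer hands out only src0 rows: the threads form an nth0 x nth1 grid, all of them assigned to whichever dimension (nr0 src0 rows or nr1 = ne11*ne12*ne13 src1 rows) is larger, and each thread derives its own row ranges from its index before walking them in 16x16 blocks through the small tmp buffer. A self-contained sketch of just the range arithmetic (mul_mat_ranges is a hypothetical helper, not part of the diff):

    #include <stdint.h>

    // illustrative only: the [lo, hi) row ranges thread ith of nth would cover
    static void mul_mat_ranges(int64_t ith, int64_t nth, int64_t nr0, int64_t nr1,
                               int64_t * ir0_lo, int64_t * ir0_hi,
                               int64_t * ir1_lo, int64_t * ir1_hi) {
        const int64_t nth0 = nr0 > nr1 ? nth : 1; // threads along src0 rows
        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // threads along src1 rows
        const int64_t ith0 = ith % nth0;
        const int64_t ith1 = ith / nth0;
        const int64_t dr0  = (nr0 + nth0 - 1)/nth0;
        const int64_t dr1  = (nr1 + nth1 - 1)/nth1;
        *ir0_lo = dr0*ith0; *ir0_hi = (*ir0_lo + dr0 < nr0) ? *ir0_lo + dr0 : nr0;
        *ir1_lo = dr1*ith1; *ir1_hi = (*ir1_lo + dr1 < nr1) ? *ir1_lo + dr1 : nr1;
    }

With nr0 = 4096, nr1 = 8 and nth = 8, for instance, thread 3 covers src0 rows [1536, 2048) and the full src1 range [0, 8); a thread whose ranges come out empty simply yields, as the diff shows.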
@@ -12920,7 +13129,7 @@ static void ggml_compute_forward_pool_1d(
         const struct ggml_tensor * src0,
               struct ggml_tensor * dst) {
 
-    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int32_t * opts = (const int32_t *)dst->op_params;
     enum ggml_op_pool op = opts[0];
     const int k0 = opts[1];
     const int s0 = opts[2];
@@ -14253,24 +14462,6 @@ static void ggml_compute_forward_map_custom1_f32(
     fun(dst, a);
 }
 
-
-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom2
 
 static void ggml_compute_forward_map_custom2_f32(
@@ -14289,24 +14480,6 @@ static void ggml_compute_forward_map_custom2_f32(
 }
 
 
-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3_f32(
@@ -14325,24 +14498,52 @@ static void ggml_compute_forward_map_custom3_f32(
     fun(dst, a, b, c);
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+              struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
+
+    p->fun(dst, a, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+              struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * a,
         const struct ggml_tensor * b,
         const struct ggml_tensor * c,
-        struct ggml_tensor * dst,
-        const ggml_custom3_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
+              struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
     }
+
+    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
 }
 
 // ggml_compute_forward_cross_entropy_loss
@@ -14864,25 +15065,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM1_F32:
             {
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
+                ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM2:
+        case GGML_OP_MAP_CUSTOM2_F32:
             {
                 ggml_custom2_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
+                ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM3:
+        case GGML_OP_MAP_CUSTOM3_F32:
             {
                 ggml_custom3_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+                ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2:
+            {
+                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3:
+            {
+                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -15690,6 +15906,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             } break;
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
         case GGML_OP_MAP_CUSTOM1:
         case GGML_OP_MAP_CUSTOM2:
         case GGML_OP_MAP_CUSTOM3:
@@ -16475,11 +16694,38 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
+            case GGML_OP_MAP_CUSTOM1_F32:
+            case GGML_OP_MAP_CUSTOM2_F32:
+            case GGML_OP_MAP_CUSTOM3_F32:
+                {
+                    n_tasks = 1;
+                } break;
             case GGML_OP_MAP_CUSTOM1:
+                {
+                    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM2:
+                {
+                    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_MAP_CUSTOM3:
                 {
-                    n_tasks = 1;
+                    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
                 } break;
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {

+ 114 - 33
llm/ggml.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *
@@ -209,6 +209,15 @@
 #    define GGML_API
 #endif
 
+// TODO: support for clang
+#ifdef __GNUC__
+#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define GGML_DEPRECATED(func, hint) func
+#endif
+
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
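GGML_DEPRECATED wraps a declaration so that it still compiles but every call site gets a warning pointing at the replacement: with __GNUC__ it appends __attribute__((deprecated(hint))), under MSVC it prepends __declspec(deprecated(hint)), and otherwise it expands to the bare declaration. A hypothetical illustration (old_api and new_api are made-up names, not from the diff):

    GGML_DEPRECATED(int old_api(void), "use new_api instead");
    // with __GNUC__ defined this expands to:
    //     int old_api(void) __attribute__((deprecated("use new_api instead")));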
@@ -400,6 +409,10 @@ extern "C" {
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
 
+        GGML_OP_MAP_CUSTOM1_F32,
+        GGML_OP_MAP_CUSTOM2_F32,
+        GGML_OP_MAP_CUSTOM3_F32,
+
         GGML_OP_MAP_CUSTOM1,
         GGML_OP_MAP_CUSTOM2,
         GGML_OP_MAP_CUSTOM3,
@@ -596,6 +609,8 @@ extern "C" {
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
 
+    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
@@ -1266,7 +1281,7 @@ extern "C" {
 
     // conv_1d with padding = half
     // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * b,
@@ -1279,7 +1294,7 @@ extern "C" {
         GGML_OP_POOL_COUNT,
     };
 
-    GGML_API struct ggml_tensor* ggml_pool_1d(
+    GGML_API struct ggml_tensor * ggml_pool_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             enum ggml_op_pool     op,
@@ -1287,7 +1302,7 @@ extern "C" {
             int                   s0, // stride
             int                   p0); // padding
 
-    GGML_API struct ggml_tensor* ggml_pool_2d(
+    GGML_API struct ggml_tensor * ggml_pool_2d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             enum ggml_op_pool     op,
@@ -1341,15 +1356,6 @@ extern "C" {
             int                   h0,
             int                   w);
 
-    // custom operators
-
-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
     GGML_API struct ggml_tensor * ggml_unary(
             struct ggml_context * ctx,
              struct ggml_tensor * a,
@@ -1360,63 +1366,138 @@ extern "C" {
         struct ggml_tensor  * a,
         enum ggml_unary_op op);
 
-    GGML_API struct ggml_tensor * ggml_map_unary_f32(
+    // custom operators
+
+    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context        * ctx,
             struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun);
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
             struct ggml_context        * ctx,
             struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun);
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_binary_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context         * ctx,
             struct ggml_tensor          * a,
             struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun);
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
             struct ggml_context         * ctx,
             struct ggml_tensor          * a,
             struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun);
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
             struct ggml_context          * ctx,
             struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun);
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
             struct ggml_context          * ctx,
             struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun);
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
             struct ggml_context          * ctx,
             struct ggml_tensor           * a,
             struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun);
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
             struct ggml_context          * ctx,
             struct ggml_tensor           * a,
             struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun);
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
             struct ggml_context          * ctx,
             struct ggml_tensor           * a,
             struct ggml_tensor           * b,
             struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun);
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
             struct ggml_context          * ctx,
             struct ggml_tensor           * a,
             struct ggml_tensor           * b,
             struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun);
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3_inplace instead");
+
+    // custom operators v2
+
+    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+    #define GGML_N_TASKS_MAX -1
+
+    GGML_API struct ggml_tensor * ggml_map_custom1(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
 
     // loss function
 

+ 1 - 1
llm/k_quants.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

+ 1 - 1
llm/k_quants.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *

+ 42 - 2
llm/llama-util.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *
@@ -175,6 +175,46 @@ struct llama_file {
     }
 };
 
+// llama_context_data
+struct llama_data_context {
+    virtual void write(const void * src, size_t size) = 0;
+    virtual size_t get_size_written() = 0;
+    virtual ~llama_data_context() = default;
+};
+
+struct llama_data_buffer_context : llama_data_context {
+    uint8_t* ptr;
+    size_t size_written = 0;
+
+    llama_data_buffer_context(uint8_t * p) : ptr(p) {}
+
+    void write(const void * src, size_t size) override {
+        memcpy(ptr, src, size);
+        ptr += size;
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_file_context : llama_data_context {
+    llama_file* file;
+    size_t size_written = 0;
+
+    llama_data_file_context(llama_file * f) : file(f) {}
+
+    void write(const void * src, size_t size) override {
+        file->write_raw(src, size);
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
@@ -205,7 +245,7 @@ struct llama_mmap {
         // prefetch/readahead impairs performance on NUMA systems
         if (numa) { prefetch = 0; }
 #ifdef __linux__
-        if (prefetch) { flags |= MAP_POPULATE; }
+        if (prefetch >= file->size) { flags |= MAP_POPULATE; }
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {

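llama_data_context reduces state serialization to a single write() sink, which is what lets llama_copy_state_data_internal further down target a buffer and a file through the same code path. As a hypothetical illustration (this subclass is not part of the change), a third sink that only counts bytes could compute the exact serialized size without materializing the state:

    // hypothetical: measures the serialized state size; bytes are discarded
    struct llama_data_size_context : llama_data_context {
        size_t size_written = 0;

        void write(const void * src, size_t size) override {
            (void) src;             // only accounting, nothing is stored
            size_written += size;
        }

        size_t get_size_written() override {
            return size_written;
        }
    };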
+ 215 - 147
llm/llama.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *
@@ -82,6 +82,13 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static void llama_log_internal(llama_log_level level, const char* format, ...);
+static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
+#define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+
 #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
@@ -175,7 +182,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 }
 
 // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -183,14 +190,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_7B,   512ull * kB },
         { MODEL_13B,  640ull * kB },
         { MODEL_30B,  768ull * kB },
-        { MODEL_65B, 1536ull * kB },
-        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
+        { MODEL_65B, 1280ull * kB },
+        { MODEL_70B, 1280ull * kB },
     };
     return k_sizes;
 }
 
 // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
 static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
 {
     static std::map<e_model, size_t> k_sizes = {
@@ -198,8 +205,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_7B,  128ull },
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
-        { MODEL_65B, 416ull },
-        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
+        { MODEL_65B, 256ull },
+        { MODEL_70B, 256ull },
     };
     return k_sizes;
 }
@@ -464,6 +471,14 @@ struct llama_context {
     }
 };
 
+struct llama_state {
+    // We save the log callback globally
+    llama_log_callback log_callback = llama_log_callback_default;
+    void * log_callback_user_data = nullptr;
+};
+// global state
+static llama_state g_state;
+
 template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
@@ -530,7 +545,7 @@ struct llama_file_loader {
 
     llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map)
         : file(fname, "rb") {
-        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
+        LLAMA_LOG_INFO("llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
         read_vocab();
@@ -645,7 +660,7 @@ struct llama_file_saver {
     llama_file_loader * any_file_loader;
     llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
         : file(fname, "wb"), any_file_loader(any_file_loader) {
-        fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
+        LLAMA_LOG_INFO("llama.cpp: saving model to %s\n", fname);
         write_magic();
         write_hparams(new_ftype);
         write_vocab();
@@ -666,7 +681,7 @@ struct llama_file_saver {
     }
     void write_vocab() {
         if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {
-            fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
+            LLAMA_LOG_WARN("llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
         }
         uint32_t n_vocab = any_file_loader->hparams.n_vocab;
         for (uint32_t i = 0; i < n_vocab; i++) {
@@ -773,12 +788,12 @@ struct llama_model_loader {
 
     void load_all_data(llama_progress_callback progress_callback, void *  progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
-        size_t prefetch_size = 0;
+        size_t prefetch_size = file_loader->file.size;
         size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                prefetch_size += lt.size;
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                prefetch_size -= lt.size;
             }
         }
 
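Read together with the MAP_POPULATE change in llama-util.h above: prefetch_size now starts at the whole file size and has every non-CPU tensor subtracted, and mmap requests MAP_POPULATE only when the prefetch hint covers the entire file. The net effect is that the kernel pre-faults the mapping only when no tensors are offloaded; as soon as anything lives on the GPU, eager population is skipped.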
@@ -857,7 +872,7 @@ struct llama_model_loader {
             uint8_t byte = lt.data[i];
             sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
         }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
+        LLAMA_LOG_INFO("%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
                 llama_format_tensor_shape(lt.ne).c_str(), lt.size);
     }
 
@@ -890,7 +905,7 @@ static bool kv_cache_init(
     cache.ctx = ggml_init(params);
 
     if (!cache.ctx) {
-        fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+        LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }
 
@@ -1102,7 +1117,7 @@ static void llama_model_load_internal(
         LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
         hparams.n_head_kv = hparams.n_head / n_gqa;
         if (model.type == e_model::MODEL_65B && n_gqa == 8) {
-            fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
             model.type = e_model::MODEL_70B;
             hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
         }
@@ -1118,22 +1133,22 @@ static void llama_model_load_internal(
     //const uint32_t n_ff = 28672;
 
     {
-        fprintf(stderr, "%s: format     = %s\n",   __func__, llama_file_version_name(file_version));
-        fprintf(stderr, "%s: n_vocab    = %u\n",   __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx      = %u\n",   __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd     = %u\n",   __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_mult     = %u\n",   __func__, hparams.n_mult);
-        fprintf(stderr, "%s: n_head     = %u\n",   __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_head_kv  = %u\n",   __func__, hparams.n_head_kv);
-        fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
-        fprintf(stderr, "%s: n_gqa      = %u\n",   __func__, hparams.n_gqa());
-        fprintf(stderr, "%s: rnorm_eps  = %.1e\n", __func__, hparams.f_rms_norm_eps);
-        fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
-        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
-        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
-        fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: model size = %s\n",   __func__, llama_model_type_name(model.type));
+        LLAMA_LOG_INFO("%s: format     = %s\n",   __func__, llama_file_version_name(file_version));
+        LLAMA_LOG_INFO("%s: n_vocab    = %u\n",   __func__, hparams.n_vocab);
+        LLAMA_LOG_INFO("%s: n_ctx      = %u\n",   __func__, hparams.n_ctx);
+        LLAMA_LOG_INFO("%s: n_embd     = %u\n",   __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_mult     = %u\n",   __func__, hparams.n_mult);
+        LLAMA_LOG_INFO("%s: n_head     = %u\n",   __func__, hparams.n_head);
+        LLAMA_LOG_INFO("%s: n_head_kv  = %u\n",   __func__, hparams.n_head_kv);
+        LLAMA_LOG_INFO("%s: n_layer    = %u\n",   __func__, hparams.n_layer);
+        LLAMA_LOG_INFO("%s: n_rot      = %u\n",   __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+        LLAMA_LOG_INFO("%s: n_gqa      = %u\n",   __func__, hparams.n_gqa());
+        LLAMA_LOG_INFO("%s: rnorm_eps  = %.1e\n", __func__, hparams.f_rms_norm_eps);
+        LLAMA_LOG_INFO("%s: n_ff       = %u\n",   __func__, n_ff);
+        LLAMA_LOG_INFO("%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
+        LLAMA_LOG_INFO("%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
+        LLAMA_LOG_INFO("%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
+        LLAMA_LOG_INFO("%s: model size = %s\n",   __func__, llama_model_type_name(model.type));
     }
 
     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
@@ -1161,7 +1176,7 @@ static void llama_model_load_internal(
     size_t ctx_size;
     size_t mmapped_size;
     ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
 
     // create the ggml context
     {
@@ -1186,13 +1201,13 @@ static void llama_model_load_internal(
     (void) main_gpu;
     (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-    fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
     ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
-    fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
 #define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
@@ -1297,14 +1312,14 @@ static void llama_model_load_internal(
         const size_t mem_required_state =
             scale*hparams.kv_size();
 
-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
 
         (void) vram_scratch;
         (void) n_batch;
 #ifdef GGML_USE_CUBLAS
         if (low_vram) {
-            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+            LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
             ggml_cuda_set_scratch_size(0); // disable scratch
         } else {
             const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
@@ -1312,7 +1327,7 @@ static void llama_model_load_internal(
             vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
             ggml_cuda_set_scratch_size(vram_scratch);
             if (n_gpu_layers > 0) {
-                fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                LLAMA_LOG_INFO("%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
                         __func__, vram_scratch_base / kB, vram_scratch_per_context,
                         (vram_scratch + MB - 1) / MB); // round up
             }
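To put the reduced 70B constants in perspective, with illustrative defaults of n_batch = 512 and n_ctx = 2048 (neither value comes from this diff) the formula gives vram_scratch = 512 * (1280 kB + 2048 * 256 B) = 512 * (1310720 B + 524288 B) = 939524096 B, exactly 896 MB, versus 1184 MB with the previous 1536 kB / 416 B constants.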
@@ -1322,9 +1337,9 @@ static void llama_model_load_internal(
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
-        fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
         if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
+            LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
         }
         size_t vram_kv_cache = 0;
 
@@ -1333,17 +1348,17 @@ static void llama_model_load_internal(
         const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
         if (n_gpu_layers > (int) hparams.n_layer + 1) {
             if (low_vram) {
-                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+                LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
             } else {
-                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
                 vram_kv_cache += hparams.kv_size() / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
             if (low_vram) {
-                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+                LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
             } else {
-                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
                 vram_kv_cache += hparams.kv_size() / 2;
             }
         }
@@ -1352,9 +1367,9 @@ static void llama_model_load_internal(
         const int max_offloadable_layers = hparams.n_layer + 1;
 #endif // GGML_USE_CUBLAS
 
-        fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
+        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
                 __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
+        LLAMA_LOG_INFO("%s: total VRAM used: %zu MB\n",
                 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
@@ -1413,7 +1428,7 @@ static bool llama_model_load(
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
-        fprintf(stderr, "error loading model: %s\n", err.what());
+        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
         return false;
     }
 }
@@ -1777,7 +1792,7 @@ static struct ggml_cgraph * llama_build_graph(
     }
 
 #if 0
-    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+    LLAMA_LOG_INFO("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
             ggml_used_mem(ctx0)/1024.0/1024.0,
             lctx.get_buf_max_mem(0)/1024.0/1024.0,
             lctx.get_buf_max_mem(1)/1024.0/1024.0,
@@ -1838,7 +1853,7 @@ static bool llama_eval_internal(
     ggml_allocr_alloc_graph(lctx.alloc, gf);
 #endif
 
-    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
@@ -2025,7 +2040,7 @@ struct llama_tokenizer {
             left_sym.n += right_sym.n;
             right_sym.n = 0;
 
-            //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
+            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
 
             // remove the right sym from the chain
             left_sym.next = right_sym.next;
@@ -3033,7 +3048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         tensor.data = read_data.addr;
         model_loader->load_data_for(tensor);
 
-        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
+        LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
                ++idx, model_loader->tensors_map.tensors.size(),
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                ggml_type_name(tensor.type));
@@ -3055,7 +3070,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = tensor.type;
             new_data = tensor.data;
             new_size = tensor.size;
-            printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
@@ -3090,17 +3105,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
                 if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
                     convert_incompatible_tensor = true;
                 }
             }
             if (convert_incompatible_tensor) {
                 if (tensor.name == "output.weight") {
                     new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    fprintf(stderr, "F16 will be used for this tensor instead.\n");
+                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
                 } else if (tensor.name == "tok_embeddings.weight") {
                     new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
                 } else {
                     throw std::runtime_error("Unsupported tensor size encountered\n");
                 }
@@ -3120,7 +3135,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 f32_data = (float *) f32_conv_buf.addr;
             }
 
-            printf("quantizing to %s .. ", ggml_type_name(new_type));
+            LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
             work.resize(nelements * 4); // upper bound on size
@@ -3170,7 +3185,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 
-            printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -3179,18 +3194,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
             if (tot_count > 0) {
                 for (size_t i = 0; i < hist_cur.size(); i++) {
-                    printf("%5.3f ", hist_cur[i] / float(nelements));
+                    LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
                 }
             }
-            printf("\n");
+            LLAMA_LOG_INFO("\n");
         }
         total_size_org += tensor.size;
         total_size_new += new_size;
         file_saver.write_tensor(tensor, new_type, new_data, new_size);
     }
 
-    printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    printf("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
     {
         int64_t sum_all = 0;
@@ -3199,11 +3214,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 
         if (sum_all > 0) {
-            printf("%s: hist: ", __func__);
+            LLAMA_LOG_INFO("%s: hist: ", __func__);
             for (size_t i = 0; i < hist_all.size(); i++) {
-                printf("%5.3f ", hist_all[i] / float(sum_all));
+                LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
             }
-            printf("\n");
+            LLAMA_LOG_INFO("\n");
         }
     }
 }
@@ -3227,8 +3242,8 @@ struct llama_model * llama_load_model_from_file(
                 params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
+        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
         delete model;
-        fprintf(stderr, "%s: failed to load model\n", __func__);
         return nullptr;
     }
 
@@ -3261,10 +3276,9 @@ struct llama_context * llama_new_context_with_model(
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
                 *cur_percentage_p = percentage;
-                fprintf(stderr, ".");
-                fflush(stderr);
+                LLAMA_LOG_INFO(".");
                 if (percentage >= 100) {
-                    fprintf(stderr, "\n");
+                    LLAMA_LOG_INFO("\n");
                 }
             }
         };
@@ -3278,14 +3292,14 @@ struct llama_context * llama_new_context_with_model(
     // reserve memory for context buffers
     if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
-            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+            LLAMA_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }
 
         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
         }
 
         const auto & hparams = ctx->model.hparams;
@@ -3319,14 +3333,14 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-            fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
 
             // debug - for comparison with scratch buffer
             //size_t prev_req =
             //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
             //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
             //    MEM_REQ_EVAL().at(ctx->model.type);
-            //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+            //LLAMA_LOG_INFO("%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
 
             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -3362,13 +3376,13 @@ struct llama_context * llama_new_context_with_model(
 
         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
-#define LLAMA_METAL_CHECK_BUF(result)                                          \
-    if (!(result)) {                                                           \
-        fprintf(stderr, "%s: failed to add buffer\n", __func__);               \
-        llama_free(ctx);                                                       \
-        return NULL;                                                           \
+#define LLAMA_METAL_CHECK_BUF(result)                            \
+    if (!(result)) {                                             \
+        LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+        llama_free(ctx);                                         \
+        return NULL;                                             \
     }
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
@@ -3422,19 +3436,19 @@ int llama_model_quantize(
         llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
     } catch (const std::exception & err) {
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
+        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
     }
 }
 
 int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
-    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
     const int64_t t_start_lora_us = ggml_time_us();
 
     auto fin = std::ifstream(path_lora, std::ios::binary);
     if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
+        LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
         return 1;
     }
 
@@ -3443,14 +3457,14 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
         if (magic != LLAMA_FILE_MAGIC_GGLA) {
-            fprintf(stderr, "%s: bad file magic\n", __func__);
+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
             return 1;
         }
         uint32_t format_version;
         fin.read((char *) &format_version, sizeof(format_version));
 
         if (format_version != 1) {
-            fprintf(stderr, "%s: unsupported file version\n", __func__ );
+            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
             return 1;
         }
     }
@@ -3461,7 +3475,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     fin.read((char *) &lora_alpha, sizeof(lora_alpha));
     float scaling = (float)lora_alpha / (float)lora_r;
 
-    fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
 
     // create a temporary ggml context to store the lora tensors
@@ -3487,7 +3501,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     ggml_context * base_ctx = NULL;
     llama_buffer base_buf;
     if (path_base_model) {
-        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
+        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
 
         size_t ctx_size;
@@ -3544,17 +3558,17 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         const std::string lora_suffix = ".lora";
         size_t pos = name.rfind(lora_suffix);
         if (pos == std::string::npos) {
-            fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
             return 1;
         }
 
         std::string lora_type = name.substr(pos + lora_suffix.length());
         std::string base_name = name;
         base_name.erase(pos);
-        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
+        // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
 
         if (model_tensors.find(base_name) == model_tensors.end()) {
-            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+            LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
             return 1;
         }
 
@@ -3565,7 +3579,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             case 1: wtype = GGML_TYPE_F16;  break;
             default:
                     {
-                        fprintf(stderr, "%s: invalid tensor data type '%d'\n",
+                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
                                 __func__, ftype);
                         return false;
                     }
@@ -3575,7 +3589,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
         }
         else {
-            fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
             return 1;
         }
         ggml_set_name(lora_tensor, "lora_tensor");
@@ -3613,7 +3627,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             if (model_loader) {
                 // load from base model
                 if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
-                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                    LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                     return 1;
                 }
                 size_t idx = model_loader->tensors_map.name_to_idx[base_name];
@@ -3629,8 +3643,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
             if (ggml_is_quantized(base_t->type)) {
                 if (!warned) {
-                    fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
-                                    "use a f16 or f32 base model with --lora-base\n", __func__);
+                    LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                                   "use a f16 or f32 base model with --lora-base\n", __func__);
                     warned = true;
                 }
             }
@@ -3644,8 +3658,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             ggml_set_name(loraB, "loraB");
 
             if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-                fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-                               " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+                LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                                " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
                 return 1;
             }
 
@@ -3690,7 +3704,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
             n_tensors++;
             if (n_tensors % 4 == 0) {
-                fprintf(stderr, ".");
+                LLAMA_LOG_INFO(".");
             }
         }
     }
@@ -3702,7 +3716,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     }
 
     const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
-    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
+    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
 
     return 0;
 }
@@ -3711,7 +3725,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     try {
         return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -3720,7 +3734,7 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     try {
         return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
     } catch (const std::exception & err) {
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
@@ -3769,10 +3783,20 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     return s_total;
 }
 
-// Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
-    uint8_t * out = dst;
-
+/** copy state data into either a buffer or file depending on the passed-in context
+ *
+ * file context:
+ * llama_file file("/path", "wb");
+ * llama_data_file_context data_ctx(&file);
+ * llama_copy_state_data(ctx, &data_ctx);
+ *
+ * buffer context:
+ * std::vector<uint8_t> buf(max_size, 0);
+ * llama_data_buffer_context data_ctx(buf.data());
+ * llama_copy_state_data(ctx, &data_ctx);
+ *
+*/
+void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::stringstream rng_ss;
@@ -3784,8 +3808,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
         memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
 
-        memcpy(out, &rng_size,   sizeof(rng_size));    out += sizeof(rng_size);
-        memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+        data_ctx->write(&rng_size,   sizeof(rng_size));
+        data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
     }
 
     // copy logits
@@ -3793,25 +3817,29 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         const size_t logits_cap  = ctx->logits.capacity();
         const size_t logits_size = ctx->logits.size();
 
-        memcpy(out, &logits_cap,  sizeof(logits_cap));  out += sizeof(logits_cap);
-        memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+        data_ctx->write(&logits_cap,  sizeof(logits_cap));
+        data_ctx->write(&logits_size, sizeof(logits_size));
 
         if (logits_size) {
-            memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+            data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
         }
 
-        out += logits_cap * sizeof(float);
+        // If there is a gap between the size and the capacity, write padding
+        size_t padding_size = (logits_cap - logits_size) * sizeof(float);
+        if (padding_size > 0) {
+            std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
+            data_ctx->write(padding.data(), padding_size);
+        }
     }
 
     // copy embeddings
     {
         const size_t embedding_size = ctx->embedding.size();
 
-        memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+        data_ctx->write(&embedding_size, sizeof(embedding_size));
 
         if (embedding_size) {
-            memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
-            out += embedding_size * sizeof(float);
+            data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
         }
     }
 
@@ -3826,8 +3854,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         const size_t kv_size = kv_self.buf.size;
         const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
 
-        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
-        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+        data_ctx->write(&kv_size, sizeof(kv_size));
+        data_ctx->write(&kv_ntok, sizeof(kv_ntok));
 
         if (kv_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
@@ -3836,12 +3864,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
             ggml_cgraph gf{};
 
             ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-            kout3d->data = out;
-            out += ggml_nbytes(kout3d);
+            std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
+            kout3d->data = kout3d_data.data();
 
             ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-            vout3d->data = out;
-            out += ggml_nbytes(vout3d);
+            std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
+            vout3d->data = vout3d_data.data();
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
                 n_embd, kv_ntok, n_layer,
@@ -3856,15 +3884,20 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
             ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
+
+            // our data is now in the kout3d_data and vout3d_data buffers
+            // write them to file
+            data_ctx->write(kout3d_data.data(), kout3d_data.size());
+            data_ctx->write(vout3d_data.data(), vout3d_data.size());
         }
     }
+}
 
-    const size_t written  = out - dst;
-    const size_t max_size = llama_get_state_size(ctx);
-
-    LLAMA_ASSERT(written <= max_size);
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    llama_data_buffer_context data_ctx(dst);
+    llama_copy_state_data_internal(ctx, &data_ctx);
 
-    return written;
+    return data_ctx.get_size_written();
 }
 
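The public entry point keeps its original buffer contract on top of the new stream: callers still size an upper-bound buffer, copy, and may trim to the bytes actually written. A minimal round-trip sketch under the assumption that ctx and ctx2 are contexts with identical hparams; llama_set_state_data is the pre-existing restore counterpart, and snapshot_and_restore is an illustrative helper name:

    #include <vector>
    #include "llama.h"

    // hypothetical helper: snapshot ctx and restore the state into ctx2
    static void snapshot_and_restore(llama_context * ctx, llama_context * ctx2) {
        std::vector<uint8_t> state(llama_get_state_size(ctx));   // upper bound
        const size_t written = llama_copy_state_data(ctx, state.data());
        state.resize(written);                                   // trim to actual size
        llama_set_state_data(ctx2, state.data());
    }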
 // Sets the state reading from the specified source address
@@ -3983,7 +4016,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
         const uint32_t version = file.read_u32();
 
         if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
-            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
             return false;
         }
 
@@ -3991,7 +4024,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
         file.read_raw(&session_hparams, sizeof(llama_hparams));
 
         if (session_hparams != ctx->model.hparams) {
-            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+            LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
             return false;
         }
     }
@@ -4001,7 +4034,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
         const uint32_t n_token_count = file.read_u32();
 
         if (n_token_count > n_token_capacity) {
-            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
             return false;
         }
 
@@ -4015,7 +4048,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
         const size_t n_state_size_max = llama_get_state_size(ctx);
 
         if (n_state_size_cur > n_state_size_max) {
-            fprintf(stderr, "%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
+            LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
             return false;
         }
 
@@ -4032,7 +4065,7 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
     try {
         return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
     } catch (const std::exception & err) {
-        fprintf(stderr, "error loading session file: %s\n", err.what());
+        LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
         return false;
     }
 }
@@ -4049,15 +4082,9 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     file.write_u32((uint32_t) n_token_count);
     file.write_raw(tokens, sizeof(llama_token) * n_token_count);
 
-    // save the context state
-    {
-        const size_t n_state_size_max = llama_get_state_size(ctx);
-
-        std::vector<uint8_t> state_data(n_state_size_max);
-        const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data());
-
-        file.write_raw(state_data.data(), n_state_size_cur);
-    }
+    // save the context state using stream saving
+    llama_data_file_context data_ctx(&file);
+    llama_copy_state_data_internal(ctx, &data_ctx);
 
     return true;
 }
@@ -4069,7 +4096,7 @@ int llama_eval(
                          int   n_past,
                          int   n_threads) {
     if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
         return 1;
     }
 
@@ -4091,7 +4118,7 @@ int llama_eval_embd(
                              int   n_past,
                              int   n_threads) {
     if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
         return 1;
     }
 
@@ -4112,7 +4139,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
     const std::vector<llama_token> tmp(n_batch, llama_token_bos());
 
     if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
+        LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
         return 1;
     }
 
@@ -4128,7 +4155,7 @@ int llama_tokenize_with_model(
     auto res = llama_tokenize(model->vocab, text, add_bos);
 
     if (n_max_tokens < (int) res.size()) {
-        fprintf(stderr, "%s: too many tokens\n", __func__);
+        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
 
@@ -4245,15 +4272,15 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
 void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);
 
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("\n");
+    LLAMA_LOG_INFO("%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
@@ -4289,3 +4316,44 @@ const char * llama_print_system_info(void) {
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
+
+
+void llama_log_set(llama_log_callback log_callback, void * user_data) {
+    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_state.log_callback_user_data = user_data;
+}
+
+#if defined(_MSC_VER) && !defined(vsnprintf)
+#define vsnprintf _vsnprintf
+#endif
+
+static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
+    } else {
+        char* buffer2 = new char[len+1];
+        vsnprintf(buffer2, len+1, format, args_copy);
+        buffer2[len] = 0;
+        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
+        delete[] buffer2;
+    }
+    va_end(args_copy);
+}
+
+static void llama_log_internal(llama_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    llama_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}

+ 3 - 2
llm/llama.go

@@ -1,8 +1,9 @@
 package llm
 
 /*
-#cgo CPPFLAGS: -O3 -Wall -Wextra -Wno-unused-function -Wno-unused-variable -DNDEBUG -DGGML_USE_K_QUANTS
-#cgo CXXFLAGS: -std=gnu++11
+#cgo CFLAGS: -Ofast -std=c11 -fPIC
+#cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -DNDEBUG -DGGML_USE_K_QUANTS
+#cgo CXXFLAGS: -std=c++11 -fPIC
 #cgo darwin CPPFLAGS:  -DGGML_USE_ACCELERATE
 #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 #cgo darwin LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders

+ 19 - 2
llm/llama.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *
@@ -112,7 +112,20 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-   struct llama_context_params {
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress reports where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+
+    struct llama_context_params {
         uint32_t seed;         // RNG seed, -1 for random
         int32_t  n_ctx;        // text context
         int32_t  n_batch;      // prompt processing batch size
@@ -221,6 +234,10 @@ extern "C" {
         int32_t n_eval;
     };
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
     LLAMA_API int llama_max_devices();
 
     LLAMA_API struct llama_context_params llama_context_default_params();
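
Since llama_log_set accepts any sink matching llama_log_callback, filtering and redirection live entirely in the host application. A minimal sketch of a callback that drops INFO-level chatter and writes the rest to a caller-owned FILE* (my_log_cb is an illustrative name; smaller level values are more severe, so ERROR = 2 also passes the check below):

    #include <stdio.h>
    #include "llama.h"

    // hypothetical sink: keep errors and warnings, suppress INFO output
    static void my_log_cb(enum llama_log_level level, const char * text, void * user_data) {
        FILE * out = (FILE *) user_data;
        if (level <= LLAMA_LOG_LEVEL_WARN) {
            fputs(text, out);   // text normally already ends with '\n'
        }
    }

    int main(void) {
        // install once, before loading models; passing NULL restores the stderr default
        llama_log_set(my_log_cb, stderr);
        // ... load model, create context ...
        return 0;
    }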

Some files were not shown because too many files changed in this diff