
Merge pull request #255 from jmorganca/update-llama-cpp

Update llama cpp
Michael Yang 1 year ago
parent
commit
b98096389d

+ 567 - 0
llama/ggml-alloc.c

@@ -0,0 +1,567 @@
+/**
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ggml-alloc.h"
+#include "ggml.h"
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define UNUSED(x) (void)(x)
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+//#define GGML_ALLOCATOR_DEBUG
+
+//#define AT_PRINTF printf
+#define AT_PRINTF(...) ((void)0)
+
+struct hash_node {
+    struct ggml_tensor * t;
+    int n_children;
+    int n_views;
+};
+
+static size_t hash(void * p) {
+    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+}
+
+static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
+    size_t h = hash(t);
+
+    // linear probing
+    size_t i = h;
+    while (hash_table[i].t != NULL) {
+        if (hash_table[i].t == t) {
+            return &hash_table[i];
+        }
+        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+        if (i == h) {
+            // hash table is full
+            GGML_ASSERT(false);
+        }
+    }
+
+    hash_table[i].t = t;
+    return &hash_table[i];
+}
+
+// TODO: GGML_PAD ?
+static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
+    assert(alignment && !(alignment & (alignment - 1))); // power of 2
+    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
+    return offset + align;
+}
+
+struct free_block {
+    void * addr;
+    size_t size;
+};
+
+#define MAX_FREE_BLOCKS 128
+
+struct ggml_allocr {
+    void * data;
+    size_t size;
+    size_t alignment;
+    int n_free_blocks;
+    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+    size_t max_size;
+    bool measure;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    struct ggml_tensor * allocated_tensors[1024];
+#endif
+};
+
+#ifdef GGML_ALLOCATOR_DEBUG
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (alloc->allocated_tensors[i] == NULL) {
+            alloc->allocated_tensors[i] = tensor;
+            return;
+        }
+    }
+    GGML_ASSERT(!"out of allocated_tensors");
+}
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (alloc->allocated_tensors[i] == tensor ||
+            (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
+            alloc->allocated_tensors[i] = NULL;
+            return;
+        }
+    }
+    printf("tried to free tensor %s not found\n", tensor->name);
+    GGML_ASSERT(!"tensor not found");
+}
+#endif
+
+
+static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    return ggml_nbytes(tensor);
+
+    UNUSED(alloc);
+}
+
+void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size = aligned_offset(NULL, size, alloc->alignment);
+
+    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+
+    size_t max_avail = 0;
+
+    // find the best fitting free block
+    int best_fit_block = -1;
+    size_t best_fit_size = SIZE_MAX;
+    for (int i = 0; i < alloc->n_free_blocks; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
+        max_avail = MAX(max_avail, block->size);
+        if (block->size >= size && block->size <= best_fit_size) {
+            best_fit_block = i;
+            best_fit_size = block->size;
+        }
+    }
+
+    AT_PRINTF("block %d\n", best_fit_block);
+
+    if (best_fit_block == -1) {
+        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                __func__, size, max_avail);
+        GGML_ASSERT(!"not enough space in the buffer");
+        return;
+    }
+    struct free_block * block = &alloc->free_blocks[best_fit_block];
+    void * addr = block->addr;
+    block->addr = (char*)block->addr + size;
+    block->size -= size;
+    if (block->size == 0) {
+        // remove block if empty
+        alloc->n_free_blocks--;
+        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
+            alloc->free_blocks[j] = alloc->free_blocks[j+1];
+        }
+    }
+
+    tensor->data = addr;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    add_allocated_tensor(alloc, tensor);
+    size_t cur_max = (char*)addr - (char*)alloc->data + size;
+    if (cur_max > alloc->max_size) {
+        printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        for (int i = 0; i < 1024; i++) {
+            if (alloc->allocated_tensors[i]) {
+                printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
+            }
+        }
+        printf("\n");
+    }
+#endif
+
+    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
+}
+
+// this is a very naive implementation, but for our case the number of free blocks should be very small
+static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+
+    if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+        // the tensor was not allocated in this buffer
+        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
+        // the easiest way to deal with this is just to ignore it
+        return;
+    }
+
+    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size = aligned_offset(NULL, size, alloc->alignment);
+    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, tensor);
+#endif
+
+    // see if we can merge with an existing block
+    for (int i = 0; i < alloc->n_free_blocks; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
+        // check if ptr is at the end of the block
+        if ((char*)block->addr + block->size == ptr) {
+            block->size += size;
+            // check if we can merge with the next block
+            if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
+                block->size += alloc->free_blocks[i+1].size;
+                alloc->n_free_blocks--;
+                for (int j = i+1; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                }
+            }
+            return;
+        }
+        // check if ptr is at the beginning of the block
+        if ((char*)ptr + size == block->addr) {
+            block->addr = ptr;
+            block->size += size;
+            // check if we can merge with the previous block
+            if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
+                alloc->free_blocks[i-1].size += block->size;
+                alloc->n_free_blocks--;
+                for (int j = i; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                }
+            }
+            return;
+        }
+    }
+    // otherwise, add a new block
+    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = 0;
+    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+        alloc->free_blocks[i] = alloc->free_blocks[i-1];
+    }
+    // insert the new block
+    alloc->free_blocks[insert_pos].addr = ptr;
+    alloc->free_blocks[insert_pos].size = size;
+    alloc->n_free_blocks++;
+}
+
+void ggml_allocr_reset(struct ggml_allocr * alloc) {
+    alloc->n_free_blocks = 1;
+    size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
+    alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
+    alloc->free_blocks[0].size = alloc->size - align_offset;
+}
+
+struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+    *alloc = (struct ggml_allocr){
+        /*.data          = */ data,
+        /*.size          = */ size,
+        /*.alignment     = */ alignment,
+        /*.n_free_blocks = */ 0,
+        /*.free_blocks   = */ {{0}},
+        /*.hash_table    = */ {{0}},
+        /*.max_size      = */ 0,
+        /*.measure       = */ false,
+#ifdef GGML_ALLOCATOR_DEBUG
+        /*.allocated_tensors = */ {0},
+#endif
+    };
+
+    ggml_allocr_reset(alloc);
+
+    return alloc;
+}
+
+// address and size of the buffer when measuring
+// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
+static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
+static const size_t MEASURE_MAX_SIZE  = 1ULL<<40; // 1 TB
+
+struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+    *alloc = (struct ggml_allocr){
+        /*.data          = */ MEASURE_BASE_ADDR,
+        /*.size          = */ MEASURE_MAX_SIZE,
+        /*.alignment     = */ alignment,
+        /*.n_free_blocks = */ 0,
+        /*.free_blocks   = */ {{0}},
+        /*.hash_table    = */ {{0}},
+        /*.max_size      = */ 0,
+        /*.measure       = */ true,
+#ifdef GGML_ALLOCATOR_DEBUG
+        /*.allocated_tensors = */ {0},
+#endif
+    };
+
+    ggml_allocr_reset(alloc);
+
+    return alloc;
+}
+
+void ggml_allocr_free(struct ggml_allocr * alloc) {
+    free(alloc);
+}
+
+bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
+    return alloc->measure;
+}
+
+//////////// compute graph allocator
+
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
+           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+}
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
+    switch (t->op) {
+        case GGML_OP_PERMUTE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_VIEW:
+            return t->src[0];
+        case GGML_OP_CPY:
+            return t->src[1];
+        default:
+            return NULL;
+    }
+}
+
+static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
+    struct ggml_tensor * parent = t;
+    do {
+        parent = get_view_parent(parent);
+    } while (ggml_is_view(parent));
+    return parent;
+}
+
+static bool ggml_op_can_inplace(enum ggml_op op) {
+    switch (op) {
+        case GGML_OP_SCALE:
+        case GGML_OP_DIAG_MASK_ZERO:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_ACC:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_LOG:
+        case GGML_OP_UNARY:
+        case GGML_OP_ROPE:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SET:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_CONT:
+            return true;
+
+        default:
+            return false;
+    }
+}
+
+static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
+    struct hash_node * ht = alloc->hash_table;
+    if (node->data == NULL) {
+        if (ggml_is_view(node)) {
+            size_t offset;
+            switch(node->op) {
+                case GGML_OP_VIEW:
+                    memcpy(&offset, node->op_params, sizeof(size_t));
+                    node->data = (char *) node->src[0]->data + offset;
+                    break;
+                case GGML_OP_PERMUTE:
+                case GGML_OP_RESHAPE:
+                case GGML_OP_TRANSPOSE:
+                    node->data = node->src[0]->data;
+                    break;
+                case GGML_OP_CPY:
+                    node->data = node->src[1]->data;
+                    break;
+                default:
+                    GGML_ASSERT(!"unknown view op");
+                    break;
+            }
+        } else {
+            // see if we can reuse a parent's buffer (inplace)
+            if (ggml_op_can_inplace(node->op)) {
+                for (int i = 0; i < GGML_MAX_SRC; i++) {
+                    struct ggml_tensor * parent = node->src[i];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    struct hash_node * p_hn = hash_get(ht, parent);
+                    if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
+                        if (ggml_is_view(parent)) {
+                            struct ggml_tensor * view_src = get_view_source(parent);
+                            struct hash_node * view_src_hn = hash_get(ht, view_src);
+                            if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                                // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
+                                // the parent's data that it will need later (same layout requirement). the problem is that then
+                                // we cannot free the tensor because the original address of the allocation is lost.
+                                // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
+                                // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
+                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                                node->data = parent->data;
+                                return;
+                            }
+                        }
+                        else {
+                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                            node->data = parent->data;
+                        }
+                        return;
+                    }
+                }
+            }
+            ggml_allocr_alloc(alloc, node);
+        }
+    }
+}
+
+static size_t ggml_allocator_alloc_graph_tensors_n(
+    struct ggml_allocr * alloc,
+    struct ggml_cgraph ** graphs, int n_graphs,
+    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
+
+    // reset hash table
+    struct hash_node * ht = alloc->hash_table;
+    memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
+
+    // count number of children and views
+    for (int g = 0; g < n_graphs; g++) {
+        struct ggml_cgraph * gf = graphs[g];
+        for (int i = 0; i < gf->n_nodes; i++) {
+            struct ggml_tensor * node = gf->nodes[i];
+
+            if (ggml_is_view(node)) {
+                struct ggml_tensor * view_src = get_view_source(node);
+                hash_get(ht, view_src)->n_views += 1;
+            }
+
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                hash_get(ht, parent)->n_children += 1;
+            }
+        }
+    }
+
+    // allocate tensors
+    for (int g = 0; g < n_graphs; g++) {
+        struct ggml_cgraph * gf = graphs[g];
+        AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
+        // graph inputs are allocated first to ensure that they are not overwritten by each other
+        if (inputs != NULL && inputs[g] != NULL) {
+            for (int i = 0; inputs[g][i] != NULL; i++) {
+                struct ggml_tensor * input = inputs[g][i];
+                AT_PRINTF("input: %s\n", input->name);
+                allocate_node(alloc, input);
+            }
+        }
+        for (int i = 0; i < gf->n_nodes; i++) {
+            struct ggml_tensor * node = gf->nodes[i];
+
+            // allocate parents (leafs)
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                allocate_node(alloc, parent);
+            }
+
+            // allocate node
+            allocate_node(alloc, node);
+
+            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                AT_PRINTF("%s", parent->name);
+                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                    AT_PRINTF(", ");
+                }
+            }
+            AT_PRINTF("\n");
+
+            // update parents
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                struct hash_node * p_hn = hash_get(ht, parent);
+                p_hn->n_children -= 1;
+
+                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                    if (ggml_is_view(parent)) {
+                        struct ggml_tensor * view_src = get_view_source(parent);
+                        struct hash_node * view_src_hn = hash_get(ht, view_src);
+                        view_src_hn->n_views -= 1;
+                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                            ggml_allocator_free_tensor(alloc, view_src);
+                        }
+                    }
+                    else {
+                        if (parent->data != node->data) {
+                            ggml_allocator_free_tensor(alloc, parent);
+                        }
+                    }
+                }
+            }
+            AT_PRINTF("\n");
+        }
+        // free graph outputs here that wouldn't be freed otherwise because they have no children
+        if (outputs != NULL && outputs[g] != NULL) {
+            for (int i = 0; outputs[g][i] != NULL; i++) {
+                struct ggml_tensor * output = outputs[g][i];
+                AT_PRINTF("output: %s\n", output->name);
+                ggml_allocator_free_tensor(alloc, output);
+            }
+        }
+    }
+
+    return alloc->max_size;
+}
+
+size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
+    return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+}

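The allocator above keeps a small, address-sorted free list, serves each request from the best-fitting block, and merges neighbouring blocks again when a tensor is freed. Request sizes are first rounded up with aligned_offset; a quick illustrative check of that rounding (not part of the commit, and aligned_offset is static to this file):

    // with a NULL base pointer, aligned_offset rounds the offset up to the next multiple of alignment
    assert(aligned_offset(NULL, 37, 32) == 64);   // 37 -> 64
    assert(aligned_offset(NULL, 64, 32) == 64);   // already aligned
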
+ 48 - 0
llama/ggml-alloc.h

@@ -0,0 +1,48 @@
+/**
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Georgi Gerganov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+
+GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+
+GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
+GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
+
+
+#ifdef  __cplusplus
+}
+#endif

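Taken together, the header above exposes a two-pass workflow: a "measure" allocator first walks the graph over a fake buffer (starting at MEASURE_BASE_ADDR) to record the peak memory needed, then a real allocator is created over a buffer of that size and the same graph is allocated again, reusing parent buffers in place where possible. A rough sketch of that flow, not part of this commit; build_graph(), ctx and the alignment value are placeholders:

    // measure pass: no real memory is touched, ggml_allocr_alloc_graph returns the peak size used
    struct ggml_allocr * measure = ggml_allocr_new_measure(/*alignment=*/32);
    size_t mem_size = ggml_allocr_alloc_graph(measure, build_graph(ctx));
    ggml_allocr_free(measure);

    // real pass: allocate a buffer of that size and assign tensor->data for an identical graph
    void * buf = malloc(mem_size);
    struct ggml_allocr * alloc = ggml_allocr_new(buf, mem_size, /*alignment=*/32);
    ggml_allocr_alloc_graph(alloc, build_graph(ctx));
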
File diff is too large
+ 1470 - 366
llama/ggml-cuda.cu


+ 2 - 1
llama/ggml-cuda.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
@@ -53,6 +53,7 @@ void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void   ggml_cuda_set_main_device(int main_device);
+void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
 void   ggml_cuda_set_scratch_size(size_t scratch_size);
 void   ggml_cuda_free_scratch(void);
 bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

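The only API addition here is ggml_cuda_set_mul_mat_q, a process-wide toggle that, judging by its name and the accompanying ggml-cuda.cu changes, selects between the quantized (mul_mat_q) matrix-multiplication kernels and the dequantize-then-cuBLAS path; a caller would simply do something like ggml_cuda_set_mul_mat_q(true); before evaluating the graph.
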
+ 1 - 1
llama/ggml-metal.h

@@ -1,7 +1,7 @@
 //go:build darwin
 
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *

+ 18 - 17
llama/ggml-metal.m

@@ -1,7 +1,7 @@
 //go:build darwin
 
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
@@ -746,7 +746,8 @@ void ggml_metal_graph_compute(
                             // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
 
                             GGML_ASSERT(ne00 == ne10);
-                            GGML_ASSERT(ne02 == ne12);
+                            // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+                            GGML_ASSERT(ne03 == ne13);
 
                             if (ggml_is_contiguous(src0) &&
                                 ggml_is_contiguous(src1) &&
@@ -774,11 +775,11 @@ void ggml_metal_graph_compute(
                                     initWithDevice:ctx->device transposeLeft:false transposeRight:true
                                         resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
 
-                                // we need to do ne02 multiplications
+                                // we need to do ne12 multiplications
                                 // TODO: is there a way to do this in parallel - currently very slow ..
                                 // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-                                for (int64_t i02 = 0; i02 < ne02; ++i02) {
-                                    size_t offs_src0_cur = offs_src0 + i02*nb02;
+                                for (int64_t i02 = 0; i02 < ne12; ++i02) {
+                                    size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
                                     size_t offs_src1_cur = offs_src1 + i02*nb12;
                                     size_t offs_dst_cur  = offs_dst  + i02*nb2;
 
@@ -800,8 +801,6 @@ void ggml_metal_graph_compute(
                                 switch (src0t) {
                                     case GGML_TYPE_F16:
                                         {
-                                            GGML_ASSERT(ne02 == ne12);
-
                                             nth0 = 64;
                                             nth1 = 1;
                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -881,16 +880,18 @@ void ggml_metal_graph_compute(
                                 [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                                 [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                                 [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
-                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
+                                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
+                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
 
                                 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                                     src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {

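The ne12/ne02 indexing introduced above lets src1 carry more batches (heads) than src0, i.e. the smaller tensor is broadcast, which is what grouped-query attention needs. As an illustrative example (values not from this commit): with ne02 = 8 KV heads and ne12 = 32 query heads, i02/(ne12/ne02) = i02/4, so query heads 0-3 read KV head 0, heads 4-7 read KV head 1, and so on; when ne02 == ne12 the expression reduces to the old i02*nb02 offset.
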
+ 5 - 2
llama/ggml-metal.metal

@@ -1,7 +1,7 @@
 //go:build darwin
 
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
@@ -537,11 +537,13 @@ kernel void kernel_mul_mat_f16_f32(
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
+        constant   int64_t & ne02,
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
+        constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
@@ -557,7 +559,7 @@ kernel void kernel_mul_mat_f16_f32(
     const int64_t r1 = tgpig.y;
     const int64_t im = tgpig.z;
 
-    device const half  * x = (device const half  *) (src0 + r0*nb01 + im*nb02);
+    device const half  * x = (device const half  *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
     sum[tpitg.x] = 0.0f;
@@ -580,6 +582,7 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }
 
+
 kernel void kernel_alibi_f32(
         device const float * src0,
         device       float * dst,

+ 1 - 1
llama/ggml-mpi.c

@@ -1,7 +1,7 @@
 //go:build mpi
 
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-mpi.h

@@ -1,7 +1,7 @@
 //go:build mpi
 
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-opencl.cpp

@@ -1,7 +1,7 @@
 //go:build opencl
 
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-opencl.h

@@ -1,7 +1,7 @@
 //go:build opencl
 
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *

+ 50 - 27
llama/ggml.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
@@ -4583,10 +4583,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
 
 static struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
-        enum   ggml_type type,
-        int    n_dims,
-        const int64_t* ne,
-        void*  data) {
+        enum   ggml_type      type,
+        int                   n_dims,
+        const int64_t       * ne,
+        void                * data) {
+
+    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
 
     size_t data_size = 0;
 
@@ -4674,22 +4676,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3
 
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
-        enum   ggml_type type,
-        int    n_dims,
-        const int64_t * ne) {
+        enum   ggml_type      type,
+        int                   n_dims,
+        const int64_t       * ne) {
     return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
 }
 
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
-        enum   ggml_type type,
+        enum   ggml_type      type,
         int64_t ne0) {
     return ggml_new_tensor(ctx, type, 1, &ne0);
 }
 
 struct ggml_tensor * ggml_new_tensor_2d(
         struct ggml_context * ctx,
-        enum   ggml_type type,
+        enum   ggml_type      type,
         int64_t ne0,
         int64_t ne1) {
     const int64_t ne[2] = { ne0, ne1 };
@@ -4698,7 +4700,7 @@ struct ggml_tensor * ggml_new_tensor_2d(
 
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
-        enum   ggml_type type,
+        enum   ggml_type      type,
         int64_t ne0,
         int64_t ne1,
         int64_t ne2) {
@@ -6264,6 +6266,27 @@ struct ggml_tensor * ggml_reshape_4d(
 
 // ggml_view_1d
 
+static struct ggml_tensor * ggml_view_tensor_offset(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_dims,
+        const int64_t       * ne,
+        size_t                offset) {
+    // don't calculate an offset from an unallocated tensor
+    void * data = NULL;
+    if (a->data != NULL) {
+        data = (char *) a->data + offset;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
+
+    ggml_format_name(result, "%s (view)", a->name);
+
+    ggml_set_op_params(result, &offset, sizeof(offset));
+
+    return result;
+}
+
 struct ggml_tensor * ggml_view_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -6276,10 +6299,7 @@ struct ggml_tensor * ggml_view_1d(
         is_node = true;
     }
 
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
 
     result->op   = GGML_OP_VIEW;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6306,10 +6326,7 @@ struct ggml_tensor * ggml_view_2d(
 
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
 
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = result->nb[1]*ne1;
@@ -6342,10 +6359,7 @@ struct ggml_tensor * ggml_view_3d(
 
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
 
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
@@ -6380,10 +6394,7 @@ struct ggml_tensor * ggml_view_4d(
 
     const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
 
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
-    ggml_format_name(result, "%s (view)", a->name);
-
-    ggml_set_op_params(result, &offset, sizeof(offset));
+    struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
 
     result->nb[1] = nb1;
     result->nb[2] = nb2;
@@ -6767,6 +6778,18 @@ struct ggml_tensor * ggml_rope_inplace(
     return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
 }
 
+struct ggml_tensor * ggml_rope_custom(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx,
+        float                 freq_base,
+        float                 freq_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);
+}
+
 struct ggml_tensor * ggml_rope_custom_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,

+ 13 - 2
llama/ggml.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
@@ -1196,7 +1196,18 @@ extern "C" {
             int                   mode,
             int                   n_ctx);
 
-    // custom RoPE, in-place, returns view(a)
+    // custom RoPE
+    GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale);
+
+    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

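ggml_rope_custom mirrors the existing ggml_rope_custom_inplace but returns a new tensor instead of a view, exposing freq_base and freq_scale to callers that do not want the in-place variant. A hedged call sketch (ctx0, Qcur, n_past, n_rot and n_ctx are placeholder names, and the scale value is only an example of linear position scaling):

    // RoPE with a custom base frequency and a 2x linear position scale
    struct ggml_tensor * q = ggml_rope_custom(
            ctx0, Qcur,
            n_past, n_rot, /*mode=*/0, n_ctx,
            /*freq_base=*/10000.0f, /*freq_scale=*/0.5f);
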
+ 33 - 31
llama/k_quants.c

@@ -1,5 +1,5 @@
 /**
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  *
  * MIT License
  * MIT License
  *
  *
@@ -65,6 +65,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
 
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 //
 //
 // 2-6 bit quantization in super-blocks
 // 2-6 bit quantization in super-blocks
 //
 //
@@ -1379,7 +1381,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
 
         __m256i sumi = _mm256_setzero_si256();
         __m256i sumi = _mm256_setzero_si256();
 
 
@@ -1447,7 +1449,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
         const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
 
 
         // sumf += -dmin * summs in 32bits*8
         // sumf += -dmin * summs in 32bits*8
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
 
 
         const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
         const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
         const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
         const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1519,7 +1521,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         }
         }
 
 
         // sumf += dall * isum - dmin * summs in 32bits
         // sumf += dall * isum - dmin * summs in 32bits
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
     }
     }
 
 
@@ -1670,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         summs += dmin * smin;
         summs += dmin * smin;
 
 
         const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
         const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-        const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+        const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+        const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
 
 
         const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
         const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
         const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
         const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1735,10 +1737,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
         const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
         const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
         const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
 
 
-        const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-        const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-        const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-        const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+        const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+        const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+        const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+        const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
 
 
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1943,7 +1945,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
         const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
         const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
 
         // high bit
         // high bit
         const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
         const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2154,7 +2156,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         }
         }
 
 
         // multiply with block scale and accumulate
         // multiply with block scale and accumulate
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 
 
     }
     }
@@ -2329,13 +2331,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         aux16[0] = a & 0x0f0f;
         aux16[0] = a & 0x0f0f;
         aux16[1] = (a >> 4) & 0x0f0f;
         aux16[1] = (a >> 4) & 0x0f0f;
 
 
-        const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-        const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+        const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+        const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
 
 
         memcpy(&aux64, x[i].hmask, 8);
         memcpy(&aux64, x[i].hmask, 8);
 
 
         const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
         const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+        __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
         __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
         __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
         q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
         q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
         q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
         q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2344,7 +2346,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
         const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
 
 
         // prepare low and high bits
         // prepare low and high bits
-        const __m256i q3aux  = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+        const __m256i q3aux  = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
         const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
         const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
         const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
         const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
 
 
@@ -2455,7 +2457,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
 
         p16_0 = _mm_add_epi32(p16_0, p16_2);
         p16_0 = _mm_add_epi32(p16_0, p16_2);
         p16_1 = _mm_add_epi32(p16_1, p16_3);
         p16_1 = _mm_add_epi32(p16_1, p16_3);
-        __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+        __m256i p16 = MM256_SET_M128I(p16_1, p16_0);
 
 
         // multiply with block scale and accumulate
         // multiply with block scale and accumulate
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2646,7 +2648,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
 
         const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
         __m256i sumi = _mm256_setzero_si256();
 
@@ -2753,7 +2755,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         }
 
         __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
     }
@@ -2994,11 +2996,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
         const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
         const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
 
         const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
         const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
 
     }
 
@@ -3186,7 +3188,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         summs += dmin * _mm_extract_epi32(hsum, 0);
 
         const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
         const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
         __m256i hmask = mone;
@@ -3325,7 +3327,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         }
 
         __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
     }
@@ -3488,13 +3490,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
         const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
 
-        const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-        const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+        const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+        const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
 
         int64_t aux64;
         memcpy(&aux64, x[i].qh, 8);
         const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+        const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
 
         const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
         const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3569,7 +3571,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
         const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
 
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
 
     }
 
@@ -3951,7 +3953,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
         }
 
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
     }
 
@@ -4109,8 +4111,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
         const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
 
-        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
 
         const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
         const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4203,7 +4205,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
         sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
 
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
     }
 
     *s = hsum_float_8(acc);

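The k_quants.c hunks above only swap _mm256_set_m128i for MM256_SET_M128I; the arithmetic is unchanged. MM256_SET_M128I is the portability macro llama.cpp uses because some older compilers ship an immintrin.h without _mm256_set_m128i. A minimal sketch of the shim, assuming only the AVX2 cast/insert intrinsics (the actual definition lives upstream in k_quants.c):

    // hi fills the upper 128-bit lane, lo the lower lane - same argument order as _mm256_set_m128i
    #define MM256_SET_M128I(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
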
+ 1 - 1
llama/k_quants.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *

+ 1 - 1
llama/llama-util.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *

+ 198 - 67
llama/llama.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
@@ -82,8 +82,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#include "ggml-alloc.h"
+#define LLAMA_USE_ALLOCATOR
+#else
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
+#endif
+
 
 // available llama models
 enum e_model {
@@ -353,13 +359,22 @@ struct llama_model {
 
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-#ifdef GGML_USE_METAL
     ~llama_context() {
+        if (model_owner) {
+            delete &model;
+        }
+#ifdef GGML_USE_METAL
         if (ctx_metal) {
             ggml_metal_free(ctx_metal);
         }
-    }
 #endif
+#ifdef LLAMA_USE_ALLOCATOR
+        if (alloc) {
+            ggml_allocr_free(alloc);
+        }
+#endif
+    }
+
     std::mt19937 rng;
 
     bool has_evaluated_once = false;
@@ -397,7 +412,17 @@ struct llama_context {
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    llama_ctx_buffer buf_alloc;
+    ggml_allocr * alloc = NULL;
+#endif
+
+#ifdef LLAMA_USE_SCRATCH
     llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    int    buf_last = 0;
+    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+#endif
 
 #ifdef GGML_USE_METAL
     ggml_metal_context * ctx_metal = NULL;
@@ -407,9 +432,6 @@ struct llama_context {
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
 
-    int    buf_last = 0;
-    size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
-
     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
         size_t last_size = 0;
@@ -905,6 +927,7 @@ struct llama_context_params llama_context_default_params() {
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
+        /*.mul_mat_q                   =*/ false,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
@@ -1032,6 +1055,7 @@ static void llama_model_load_internal(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1160,9 +1184,11 @@ static void llama_model_load_internal(
     }
 
     (void) main_gpu;
+    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
+    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1256,12 +1282,16 @@ static void llama_model_load_internal(
         const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
         // this is the total memory required to run the inference
-        const size_t mem_required =
+        size_t mem_required =
             ctx_size +
-            mmapped_size - vram_weights + // weights in VRAM not in memory
+            mmapped_size - vram_weights; // weights in VRAM not in memory
+
+#ifndef LLAMA_USE_ALLOCATOR
+        mem_required +=
             MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
             MEM_REQ_EVAL().at(model.type);
+#endif
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1367,6 +1397,7 @@ static bool llama_model_load(
         int n_gpu_layers,
         int main_gpu,
         const float * tensor_split,
+        const bool mul_mat_q,
         float rope_freq_base,
         float rope_freq_scale,
         bool low_vram,
@@ -1377,7 +1408,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+                                  main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1386,32 +1418,15 @@ static bool llama_model_load(
     }
 }
 
-// evaluate the transformer
-//
-//   - lctx:      llama context
-//   - tokens:    new batch of tokens to process
-//   - embd       embeddings input
-//   - n_tokens   number of tokens
-//   - n_past:    the context size so far
-//   - n_threads: number of threads to use
-//
-static bool llama_eval_internal(
+static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_token * tokens,
            const float * embd,
                    int   n_tokens,
-                   int   n_past,
-                   int   n_threads,
-            const char * cgraph_fname) {
+                   int   n_past) {
 
     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
 
-#ifdef GGML_USE_MPI
-    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
-    const int64_t t_start_us = ggml_time_us();
-
     const int N = n_tokens;
 
     const auto & model   = lctx.model;
@@ -1427,10 +1442,8 @@ static bool llama_eval_internal(
     const int64_t n_head      = hparams.n_head;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
-    const int64_t n_vocab     = hparams.n_vocab;
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
-
     LLAMA_ASSERT(n_embd_head == hparams.n_rot);
 
     const float freq_base  = hparams.rope_freq_base;
@@ -1442,26 +1455,35 @@ static bool llama_eval_internal(
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
 
+
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.addr,
         /*.no_alloc   =*/ false,
     };
 
+#ifdef LLAMA_USE_ALLOCATOR
+    params.no_alloc = true;
+#endif
+
     struct ggml_context * ctx0 = ggml_init(params);
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
-
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
 
     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+#else
         memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+#endif
         ggml_set_name(inp_tokens, "inp_tokens");
 
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
@@ -1471,7 +1493,15 @@ static bool llama_eval_internal(
 #endif
 
         inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+#ifdef LLAMA_USE_ALLOCATOR
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+#else
         memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+#endif
     }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
@@ -1498,6 +1528,17 @@ static bool llama_eval_internal(
     }
 #endif // GGML_USE_CUBLAS
 
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+#else
+    ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+#endif
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
 
@@ -1593,9 +1634,6 @@ static bool llama_eval_internal(
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd_head)
-            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
-            ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
-
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             offload_func_kq(KQ_scaled);
@@ -1711,9 +1749,6 @@ static bool llama_eval_internal(
 
     lctx.use_buf(ctx0, 0);
 
-    // used at the end to optionally extract the embeddings
-    struct ggml_tensor * embeddings = NULL;
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
@@ -1724,8 +1759,6 @@ static bool llama_eval_internal(
         cur = ggml_mul(ctx0, cur, model.norm);
         // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
         ggml_set_name(cur, "result_norm");
-
-        embeddings = cur;
     }
 
     // lm_head
@@ -1737,12 +1770,88 @@ static bool llama_eval_internal(
     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
 
-    // run the computation
     ggml_build_forward_expand(gf, cur);
 
-    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+
+#if 0
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
+            ggml_used_mem(ctx0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(0)/1024.0/1024.0,
+            lctx.get_buf_max_mem(1)/1024.0/1024.0,
+            lctx.work_buffer.size()/1024.0/1024.0,
+            n_past, N);
+#endif
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+// evaluate the transformer
+//
+//   - lctx:      llama context
+//   - tokens:    new batch of tokens to process
+//   - embd       embeddings input
+//   - n_tokens   number of tokens
+//   - n_past:    the context size so far
+//   - n_threads: number of threads to use
+//
+static bool llama_eval_internal(
+         llama_context & lctx,
+     const llama_token * tokens,
+           const float * embd,
+                   int   n_tokens,
+                   int   n_past,
+                   int   n_threads,
+            const char * cgraph_fname) {
+
+    LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
+
+    const int64_t t_start_us = ggml_time_us();
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
+#endif
+
+    const int N = n_tokens;
+
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    LLAMA_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_vocab     = hparams.n_vocab;
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_reset(lctx.alloc);
+#endif
+
+    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+
+#ifdef LLAMA_USE_ALLOCATOR
+    ggml_allocr_alloc_graph(lctx.alloc, gf);
+#endif
+
+    // fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
+
+    // for big prompts, if BLAS is enabled, it is better to use only one thread
+    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
 
 #if GGML_USE_MPI
+    const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
 #endif
 
@@ -1754,7 +1863,10 @@ static bool llama_eval_internal(
         //}
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
+        ggml_metal_get_tensor   (lctx.ctx_metal, res);
+        if (!lctx.embedding.empty()) {
+            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+        }
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1785,8 +1897,6 @@ static bool llama_eval_internal(
     // update kv token count
     lctx.kv_self.n = n_past + N;
 
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-
     if (cgraph_fname) {
         ggml_graph_export(gf, cgraph_fname);
     }
@@ -1824,21 +1934,6 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }
 
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-
-#if 0
-    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
-            ggml_used_mem(ctx0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0,
-            lctx.work_buffer.size()/1024.0/1024.0,
-            n_past, N);
-#endif
-
-    ggml_free(ctx0);
-
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -1950,7 +2045,9 @@ struct llama_tokenizer {
             if (token == vocab_.token_to_id.end()) {
                 // output any symbols that did not form tokens as bytes.
                 for (int j = 0; j < (int) symbol.n; ++j) {
-                    llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    // NOTE: old version, before #2420 - not sure what are the implications of this
+                    //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
                     output.push_back(token_id);
                 }
             } else {
@@ -3127,7 +3224,7 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
         delete model;
@@ -3204,10 +3301,47 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
+#ifdef LLAMA_USE_ALLOCATOR
+        {
+            static const size_t tensor_alignment = 32;
+            // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+            ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+
+            // create measure allocator
+            ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
+
+            // build worst-case graph
+            int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
+            int n_past = hparams.n_ctx - n_tokens;
+            llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
+
+            // measure memory requirements for the graph
+            size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
+
+            fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+
+            // debug - for comparison with scratch buffer
+            //size_t prev_req =
+            //    MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
+            //    MEM_REQ_SCRATCH1().at(ctx->model.type) +
+            //    MEM_REQ_EVAL().at(ctx->model.type);
+            //fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
+
+            // recreate allocator with exact memory requirements
+            ggml_allocr_free(ctx->alloc);
+
+            ctx->buf_alloc.resize(alloc_size);
+            ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        }
+#else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
+#endif
 
+#ifdef LLAMA_USE_SCRATCH
         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
+#endif
     }
 
 #ifdef GGML_USE_METAL
@@ -3277,9 +3411,6 @@ struct llama_context * llama_init_from_file(
 }
 
 void llama_free(struct llama_context * ctx) {
-    if (ctx->model_owner) {
-        delete &ctx->model;
-    }
     delete ctx;
 }
 

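The llama.cpp changes above replace the fixed scratch buffers with a graph allocator in CPU-only builds: llama_build_graph() now only describes the computation, and ggml-alloc decides where each tensor lives. A minimal sketch of the measure-then-allocate flow, using the calls that appear in the diff (build_graph() and buf are hypothetical stand-ins for llama_build_graph and buf_alloc):

    static const size_t tensor_alignment = 32;

    // 1) measure: run graph construction against a measuring allocator to learn the worst-case size
    ggml_allocr * alloc = ggml_allocr_new_measure(tensor_alignment);
    ggml_cgraph * gf    = build_graph(/* worst-case n_tokens / n_past */);
    size_t alloc_size   = ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment;
    ggml_allocr_free(alloc);

    // 2) allocate: back a real allocator with a buffer of exactly that size
    buf.resize(alloc_size);
    alloc = ggml_allocr_new(buf.addr, buf.size, tensor_alignment);

    // 3) per eval: reset, rebuild the graph, and let the allocator place every tensor
    ggml_allocr_reset(alloc);
    gf = build_graph(/* actual inputs */);
    ggml_allocr_alloc_graph(alloc, gf);

Inside llama_build_graph the input tensors are only filled when ggml_allocr_is_measure() is false, so the measuring pass never touches real data.
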
+ 5 - 0
llama/llama.go

@@ -128,6 +128,11 @@ func New(model string, opts api.Options) (*LLM, error) {
 
 	C.llama_backend_init(C.bool(llm.UseNUMA))
 
+	// TODO: GQA == 8 suggests 70B model which doesn't support metal
+	if llm.NumGQA == 8 {
+		llm.NumGPU = 0
+	}
+
 	params := C.llama_context_default_params()
 	params.seed = C.uint(llm.Seed)
 	params.n_ctx = C.int(llm.NumCtx)

+ 2 - 1
llama/llama.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d91f3f0c55663719ea03b76311e8c36ed55eb0e2
+ * llama.cpp - git c574bddb368424b5996cbee2ec45ec050967d404
  *
  * MIT License
  *
@@ -134,6 +134,7 @@ extern "C" {
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights

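For callers, the new field is opt-in and defaults to false (see llama_context_default_params above). A sketch of enabling it from client code; whether the kernels are actually used still depends on a cuBLAS build, since the loader only forwards the flag to ggml_cuda_set_mul_mat_q under GGML_USE_CUBLAS:

    struct llama_context_params params = llama_context_default_params();
    params.mul_mat_q = true; // experimental mul_mat_q kernels
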
Some files were not shown because too many files changed in this diff