
replace static build in `llm`

jmorganca, 11 months ago
commit 01ccbc07fe
67 changed files with 14420 additions and 7669 deletions
  1. .gitignore (+0 -1)
  2. llama/ggml-alloc.c (+985 -985)
  3. llama/ggml-alloc.h (+76 -76)
  4. llama/ggml-backend-impl.h (+141 -141)
  5. llama/ggml-backend.c (+4 -7)
  6. llama/ggml-backend.h (+233 -233)
  7. llama/ggml-common.h (+1853 -1853)
  8. llama/ggml-cuda.cu (+1 -4)
  9. llama/ggml-cuda.h (+43 -45)
  10. llama/ggml-cuda/acc.cu (+47 -47)
  11. llama/ggml-cuda/acc.cuh (+5 -5)
  12. llama/ggml-cuda/arange.cu (+34 -34)
  13. llama/ggml-cuda/arange.cuh (+5 -5)
  14. llama/ggml-cuda/argsort.cu (+103 -103)
  15. llama/ggml-cuda/argsort.cuh (+3 -3)
  16. llama/ggml-cuda/binbcast.cu (+280 -280)
  17. llama/ggml-cuda/binbcast.cuh (+6 -6)
  18. llama/ggml-cuda/clamp.cuh (+5 -5)
  19. llama/ggml-cuda/concat.cu (+49 -49)
  20. llama/ggml-cuda/concat.cuh (+5 -5)
  21. llama/ggml-cuda/convert.cuh (+13 -13)
  22. llama/ggml-cuda/dequantize.cuh (+103 -103)
  23. llama/ggml-cuda/diagmask.cu (+40 -40)
  24. llama/ggml-cuda/diagmask.cuh (+5 -5)
  25. llama/ggml-cuda/dmmv.cu (+813 -813)
  26. llama/ggml-cuda/dmmv.cuh (+18 -18)
  27. llama/ggml-cuda/getrows.cu (+178 -178)
  28. llama/ggml-cuda/getrows.cuh (+5 -5)
  29. llama/ggml-cuda/im2col.cu (+104 -104)
  30. llama/ggml-cuda/im2col.cuh (+5 -5)
  31. llama/ggml-cuda/mmq.cuh (+9 -9)
  32. llama/ggml-cuda/mmvq.cuh (+7 -7)
  33. llama/ggml-cuda/norm.cu (+215 -215)
  34. llama/ggml-cuda/norm.cuh (+7 -7)
  35. llama/ggml-cuda/pad.cu (+49 -49)
  36. llama/ggml-cuda/pad.cuh (+5 -5)
  37. llama/ggml-cuda/pool2d.cu (+94 -94)
  38. llama/ggml-cuda/pool2d.cuh (+5 -5)
  39. llama/ggml-cuda/quantize.cu (+45 -45)
  40. llama/ggml-cuda/quantize.cuh (+5 -5)
  41. llama/ggml-cuda/rope.cu (+308 -308)
  42. llama/ggml-cuda/rope.cuh (+5 -5)
  43. llama/ggml-cuda/scale.cuh (+5 -5)
  44. llama/ggml-cuda/softmax.cuh (+5 -5)
  45. llama/ggml-cuda/sumrows.cu (+40 -40)
  46. llama/ggml-cuda/sumrows.cuh (+3 -3)
  47. llama/ggml-cuda/tsembd.cu (+47 -47)
  48. llama/ggml-cuda/tsembd.cuh (+5 -5)
  49. llama/ggml-cuda/upscale.cuh (+5 -5)
  50. llama/ggml-cuda/vecdotq.cuh (+1280 -1280)
  51. llama/ggml-metal.h (+66 -66)
  52. llama/ggml-metal.m (+0 -0)
  53. llama/ggml-metal.metal (+6859 -0)
  54. llama/ggml-metal.o (BIN)
  55. llama/ggml-quants.h (+133 -133)
  56. llama/llama.go (+22 -3)
  57. llama/metal.sh (+0 -11)
  58. llama/runner/README.md (+1 -1)
  59. llama/runner/runner.go (+5 -36)
  60. llama/sync.sh (+18 -6)
  61. llm/filetype.go (+5 -5)
  62. llm/generate/gen_darwin.sh (+1 -1)
  63. llm/generate/gen_linux.sh (+0 -13)
  64. llm/generate/gen_windows.ps1 (+0 -35)
  65. llm/ggml.go (+2 -2)
  66. llm/llm.go (+0 -41)
  67. server/images.go (+2 -1)

.gitignore (+0 -1)

@@ -5,7 +5,6 @@
 .swp
 dist
 ollama
-ggml-metal.metal
 .cache
 *.exe
 .idea
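
The bulk of the change is `llama/ggml-alloc.c` below: the graph allocator (`ggml_gallocr_*`) plans an offset for every node of a compute graph using a best-fit free-block strategy, reuses a parent's memory in place where `ggml_op_can_inplace` permits, and only then allocates the backing backend buffers. A minimal sketch of driving that API, assuming a CPU backend and a toy matmul graph (neither appears in this commit):

    // Sketch: measure and allocate a compute graph with ggml_gallocr.
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 64 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // the graph allocator owns all tensor data
        };
        struct ggml_context * ctx = ggml_init(params);

        // toy graph: one 64x64 matmul
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, ggml_mul_mat(ctx, a, b));

        ggml_backend_t backend = ggml_backend_cpu_init();
        ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

        // reserve sizes the buffer; alloc_graph assigns node addresses
        // (with a single buffer it can also regrow the buffer on its own)
        ggml_gallocr_reserve(galloc, gf);
        ggml_gallocr_alloc_graph(galloc, gf);
        printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

        ggml_gallocr_free(galloc);
        ggml_backend_free(backend);
        ggml_free(ctx);
        return 0;
    }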

llama/ggml-alloc.c (+985 -985)

@@ -1,985 +1,985 @@
-#include "ggml-alloc.h"
-#include "ggml-backend-impl.h"
-#include "ggml.h"
-#include "ggml-impl.h"
-#include <assert.h>
-#include <limits.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MAX_FREE_BLOCKS 256
-
-//#define GGML_ALLOCATOR_DEBUG
-
-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
-#define AT_PRINTF(...)
-
-
-static bool ggml_is_view(const struct ggml_tensor * t) {
-    return t->view_src != NULL;
-}
-
-static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
-    if (a->type != b->type) {
-        return false;
-    }
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (a->ne[i] != b->ne[i]) {
-            return false;
-        }
-        if (a->nb[i] != b->nb[i]) {
-            return false;
-        }
-    }
-    return true;
-}
-
-static bool ggml_op_can_inplace(enum ggml_op op) {
-    switch (op) {
-        case GGML_OP_SCALE:
-        case GGML_OP_DIAG_MASK_ZERO:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_ADD:
-        case GGML_OP_ADD1:
-        case GGML_OP_SUB:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_SQR:
-        case GGML_OP_SQRT:
-        case GGML_OP_LOG:
-        case GGML_OP_UNARY:
-        case GGML_OP_ROPE:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SOFT_MAX:
-            return true;
-
-        default:
-            return false;
-    }
-}
-
-static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
-    assert(alignment && !(alignment & (alignment - 1))); // power of 2
-    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
-    return offset + align;
-}
-
-// tallocr
-
-struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
-    void * base = ggml_backend_buffer_get_base(buffer);
-    size_t align = ggml_backend_buffer_get_alignment(buffer);
-
-    assert(align && !(align & (align - 1))); // power of 2
-
-    struct ggml_tallocr talloc = (struct ggml_tallocr) {
-        /*.buffer    = */ buffer,
-        /*.base      = */ base,
-        /*.alignment = */ align,
-        /*.offset    = */ aligned_offset(base, 0, align),
-    };
-    return talloc;
-}
-
-void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
-    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
-    size = GGML_PAD(size, talloc->alignment);
-
-    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
-                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
-    }
-
-    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
-    talloc->offset += size;
-
-    assert(((uintptr_t)addr % talloc->alignment) == 0);
-
-    ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
-}
-
-// dynamic tensor allocator
-
-struct free_block {
-    size_t offset;
-    size_t size;
-};
-
-struct ggml_dyn_tallocr {
-    size_t alignment;
-    int n_free_blocks;
-    struct free_block free_blocks[MAX_FREE_BLOCKS];
-    size_t max_size;
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    struct {
-        const struct ggml_tensor * tensor;
-        size_t offset;
-    } allocated_tensors[1024];
-#endif
-};
-
-#ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
-    for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i].tensor == NULL) {
-            alloc->allocated_tensors[i].tensor = tensor;
-            alloc->allocated_tensors[i].offset = offset;
-            return;
-        }
-    }
-    GGML_ASSERT(!"out of allocated_tensors");
-}
-static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
-    for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i].offset == offset) {
-            alloc->allocated_tensors[i].tensor = NULL;
-            return;
-        }
-    }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
-}
-#endif
-
-static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
-    size = aligned_offset(NULL, size, alloc->alignment);
-
-    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
-
-    size_t max_avail = 0;
-
-    // find the best fitting free block besides the last block
-    int best_fit_block = -1;
-    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size && block->size <= best_fit_size) {
-            best_fit_block = i;
-            best_fit_size = block->size;
-        }
-    }
-
-    if (best_fit_block == -1) {
-        // the last block is our last resort
-        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size) {
-            best_fit_block = alloc->n_free_blocks - 1;
-        } else {
-            // this should never happen
-            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
-                    __func__, size, max_avail);
-            GGML_ASSERT(!"not enough space in the buffer");
-            GGML_UNREACHABLE();
-        }
-    }
-
-    struct free_block * block = &alloc->free_blocks[best_fit_block];
-    size_t offset = block->offset;
-    block->offset = offset + size;
-    block->size -= size;
-    if (block->size == 0) {
-        // remove block if empty
-        alloc->n_free_blocks--;
-        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
-            alloc->free_blocks[j] = alloc->free_blocks[j+1];
-        }
-    }
-
-    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, offset, tensor);
-    size_t cur_max = offset + size;
-    if (cur_max > alloc->max_size) {
-        // sort allocated_tensors by offset
-        for (int i = 0; i < 1024; i++) {
-            for (int j = i + 1; j < 1024; j++) {
-                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
-                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
-                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
-                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
-                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
-                    alloc->allocated_tensors[j].tensor = tmp_tensor;
-                    alloc->allocated_tensors[j].offset = tmp_offset;
-                }
-            }
-        }
-        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
-        for (int i = 0; i < 1024; i++) {
-            if (alloc->allocated_tensors[i].tensor) {
-                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
-                    alloc->allocated_tensors[i].offset,
-                    alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
-                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
-            }
-        }
-        fprintf(stderr, "\n");
-    }
-#endif
-
-    alloc->max_size = MAX(alloc->max_size, offset + size);
-
-    return offset;
-
-    GGML_UNUSED(tensor);
-}
-
-// this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
-    size = aligned_offset(NULL, size, alloc->alignment);
-
-    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
-
-#ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, offset, tensor);
-#endif
-
-    // see if we can merge with an existing block
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        // check if ptr is at the end of the block
-        if (block->offset + block->size == offset) {
-            block->size += size;
-            // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
-                block->size += alloc->free_blocks[i+1].size;
-                alloc->n_free_blocks--;
-                for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                }
-            }
-            return;
-        }
-        // check if ptr is at the beginning of the block
-        if (offset + size == block->offset) {
-            block->offset = offset;
-            block->size += size;
-            // check if we can merge with the previous block
-            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
-                alloc->free_blocks[i-1].size += block->size;
-                alloc->n_free_blocks--;
-                for (int j = i; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
-                }
-            }
-            return;
-        }
-    }
-    // otherwise, add a new block
-    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
-        insert_pos++;
-    }
-    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
-        alloc->free_blocks[i] = alloc->free_blocks[i-1];
-    }
-    // insert the new block
-    alloc->free_blocks[insert_pos].offset = offset;
-    alloc->free_blocks[insert_pos].size = size;
-    alloc->n_free_blocks++;
-
-    GGML_UNUSED(tensor);
-}
-
-static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
-    alloc->n_free_blocks = 1;
-    alloc->free_blocks[0].offset = 0;
-    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
-    alloc->max_size = 0;
-}
-
-static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
-    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
-
-    *alloc = (struct ggml_dyn_tallocr) {
-        /*.alignment     = */ alignment,
-        /*.n_free_blocks = */ 0,
-        /*.free_blocks   = */ {{0}},
-        /*.max_size      = */ 0,
-#ifdef GGML_ALLOCATOR_DEBUG
-        /*.allocated_tensors = */ {{0}},
-#endif
-    };
-
-    ggml_dyn_tallocr_reset(alloc);
-
-    return alloc;
-}
-
-static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
-    free(alloc);
-}
-
-static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    return alloc->max_size;
-}
-
-
-/////////////////////////////////////
-
-// graph allocator
-
-struct hash_node {
-    int n_children;
-    int n_views;
-    int buffer_id;
-    size_t offset; // offset within the buffer
-    bool allocated;
-};
-
-struct tensor_alloc {
-    size_t offset;
-    size_t size_max; // 0 = pre-allocated, unused, or view
-};
-
-struct leaf_alloc {
-    int buffer_id;
-    struct tensor_alloc leaf;
-};
-
-struct node_alloc {
-    int buffer_id;
-    struct tensor_alloc dst;
-    struct tensor_alloc src[GGML_MAX_SRC];
-};
-
-struct ggml_gallocr {
-    ggml_backend_buffer_type_t * bufts; // [n_buffers]
-    ggml_backend_buffer_t * buffers; // [n_buffers]
-    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
-    int n_buffers;
-
-    struct ggml_hash_set hash_set;
-    struct hash_node * hash_values; // [hash_set.size]
-
-    struct node_alloc * node_allocs; // [n_nodes]
-    int n_nodes;
-
-    struct leaf_alloc * leaf_allocs; // [n_leafs]
-    int n_leafs;
-};
-
-ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
-    GGML_ASSERT(galloc != NULL);
-
-    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
-    GGML_ASSERT(galloc->bufts != NULL);
-
-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
-    GGML_ASSERT(galloc->buffers != NULL);
-
-    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
-    GGML_ASSERT(galloc->buf_tallocs != NULL);
-
-    for (int i = 0; i < n_bufs; i++) {
-        galloc->bufts[i] = bufts[i];
-        galloc->buffers[i] = NULL;
-        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
-    }
-    galloc->n_buffers = n_bufs;
-
-    return galloc;
-}
-
-ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
-    return ggml_gallocr_new_n(&buft, 1);
-}
-
-void ggml_gallocr_free(ggml_gallocr_t galloc) {
-    if (galloc == NULL) {
-        return;
-    }
-
-    for (int i = 0; i < galloc->n_buffers; i++) {
-        if (galloc->buffers != NULL) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
-        }
-        if (galloc->buf_tallocs != NULL) {
-            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
-        }
-    }
-
-    free(galloc->hash_set.keys);
-    free(galloc->hash_values);
-    free(galloc->bufts);
-    free(galloc->buffers);
-    free(galloc->buf_tallocs);
-    free(galloc->node_allocs);
-    free(galloc->leaf_allocs);
-    free(galloc);
-}
-
-typedef struct ggml_gallocr * ggml_gallocr_t;
-
-static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
-    return &galloc->hash_values[i];
-}
-
-static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return ggml_gallocr_hash_get(galloc, t)->allocated;
-}
-
-static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
-    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-    hn->buffer_id = buffer_id;
-    hn->offset = offset;
-    hn->allocated = true;
-}
-
-static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
-}
-
-static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
-    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-
-    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
-        hn->allocated = true;
-        assert(hn->offset == 0);
-
-        // try to reuse a parent's buffer (inplace)
-        if (ggml_op_can_inplace(node->op)) {
-            for (int i = 0; i < GGML_MAX_SRC; i++) {
-                struct ggml_tensor * parent = node->src[i];
-                if (parent == NULL) {
-                    continue;
-                }
-
-                // if the node's data is external, then we cannot re-use it
-                if (!ggml_gallocr_is_own(galloc, parent)) {
-                    AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
-                    continue;
-                }
-
-                // outputs cannot be reused
-                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
-                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
-                    continue;
-                }
-
-                if (!ggml_are_same_layout(node, parent)) {
-                    AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
-                    continue;
-                }
-
-                struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
-                if (p_hn->n_children == 1 && p_hn->n_views == 0) {
-                    if (ggml_is_view(parent)) {
-                        struct ggml_tensor * view_src = parent->view_src;
-                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
-                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
-                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                            assert(view_src_hn->offset == p_hn->offset);
-                            hn->buffer_id = p_hn->buffer_id;
-                            hn->offset = p_hn->offset;
-                            p_hn->allocated = false; // avoid freeing the parent
-                            view_src_hn->allocated = false;
-                            return;
-                        }
-                    } else {
-                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                        hn->buffer_id = p_hn->buffer_id;
-                        hn->offset = p_hn->offset;
-                        p_hn->allocated = false; // avoid freeing the parent
-                        return;
-                    }
-                }
-            }
-        }
-        // allocate tensor from the buffer
-        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
-        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
-        hn->buffer_id = buffer_id;
-        hn->offset = offset;
-        return;
-    }
-}
-
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
-    // graph outputs are never freed
-    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
-        AT_PRINTF("not freeing output %s\n", node->name);
-        return;
-    }
-
-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
-    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-    size_t offset = hn->offset;
-    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
-    hn->allocated = false;
-}
-
-static int get_node_buffer_id(const int * node_buffer_ids, int i) {
-    return node_buffer_ids ? node_buffer_ids[i] : 0;
-}
-
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
-
-    // allocate leafs
-    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
-    }
-
-    // count number of children and views
-    // allocate other graph inputs and leafs first to avoid overwriting them
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-
-        // TODO: better way to add external dependencies
-        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
-        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
-        // itself is never used and should not be considered a dependency
-        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
-            struct ggml_tensor * view_src = node->view_src;
-            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
-        }
-
-        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
-            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
-        }
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-
-            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
-
-            // allocate explicit inputs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
-                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
-            }
-        }
-    }
-
-    // allocate tensors
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        int buffer_id = get_node_buffer_id(node_buffer_ids, i);
-
-        // allocate parents (only leafs need to be allocated at this point)
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                continue;
-            }
-            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
-        }
-
-        // allocate node
-        ggml_gallocr_allocate_node(galloc, node, buffer_id);
-
-        AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                continue;
-            }
-            AT_PRINTF("%s", parent->name);
-            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
-                AT_PRINTF(", ");
-            }
-        }
-        AT_PRINTF("\n");
-
-        // update parents
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
-                continue;
-            }
-            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
-            p_hn->n_children -= 1;
-
-            AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
-                parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
-
-            if (p_hn->n_children == 0 && p_hn->n_views == 0) {
-                if (ggml_is_view(parent)) {
-                    struct ggml_tensor * view_src = parent->view_src;
-                    struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
-                    view_src_hn->n_views -= 1;
-                    AT_PRINTF("view_src %s: %d children, %d views\n",
-                        view_src->name, view_src_hn->n_children, view_src_hn->n_views);
-                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                        ggml_gallocr_free_node(galloc, view_src, buffer_id);
-                    }
-                }
-                else if (p_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, parent, buffer_id);
-                }
-            }
-            AT_PRINTF("\n");
-        }
-    }
-}
-
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
-
-    // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
-        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
-        GGML_ASSERT(galloc->hash_set.keys != NULL);
-        GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values,   0, sizeof(struct hash_node) * galloc->hash_set.size);
-    }
-
-    // reset allocators
-    for (int i = 0; i < galloc->n_buffers; i++) {
-        ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
-    }
-
-    // allocate in hash table
-    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
-
-    // set the node_allocs from the hash table
-    if (galloc->n_nodes < graph->n_nodes) {
-        free(galloc->node_allocs);
-        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
-        GGML_ASSERT(galloc->node_allocs != NULL);
-    }
-    galloc->n_nodes = graph->n_nodes;
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
-        if (node->view_src || node->data) {
-            node_alloc->dst.offset = SIZE_MAX;
-            node_alloc->dst.size_max = 0;
-        } else {
-            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-            node_alloc->dst.offset   = hn->offset;
-            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (!src || src->view_src || src->data) {
-                node_alloc->src[j].offset = SIZE_MAX;
-                node_alloc->src[j].size_max = 0;
-            } else {
-                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
-                node_alloc->src[j].offset   = hn->offset;
-                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
-            }
-        }
-    }
-    if (galloc->n_leafs < graph->n_leafs) {
-        free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
-        GGML_ASSERT(galloc->leaf_allocs != NULL);
-    }
-    galloc->n_leafs = graph->n_leafs;
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
-        if (leaf->view_src || leaf->data) {
-            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
-            galloc->leaf_allocs[i].leaf.size_max = 0;
-        } else {
-            galloc->leaf_allocs[i].leaf.offset = hn->offset;
-            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
-        }
-    }
-
-    // reallocate buffers if needed
-    for (int i = 0; i < galloc->n_buffers; i++) {
-        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
-        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
-
-        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
-        if (new_size > cur_size || galloc->buffers[i] == NULL) {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
-#endif
-            ggml_backend_buffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
-            if (galloc->buffers[i] == NULL) {
-                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                return false;
-            }
-        }
-    }
-
-    return true;
-}
-
-bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
-    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
-}
-
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
-    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
-
-    if (tensor->view_src != NULL) {
-        if (tensor->buffer == NULL) {
-            assert(tensor_alloc->offset == SIZE_MAX);
-            if (tensor->view_src->buffer == NULL) {
-                // this tensor was allocated without ggml-backend
-                return;
-            }
-            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
-        }
-    } else {
-        if (tensor->data == NULL) {
-            assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
-            void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
-        } else {
-            if (tensor->buffer == NULL) {
-                // this tensor was allocated without ggml-backend
-                return;
-            }
-        }
-    }
-}
-
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
-    return talloc->size_max >= node_size;
-}
-
-static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
-    if (galloc->n_nodes != graph->n_nodes) {
-#ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
-#endif
-        return true;
-    }
-
-    if (galloc->n_leafs != graph->n_leafs) {
-#ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
-#endif
-        return true;
-    }
-
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        struct node_alloc * node_alloc = &galloc->node_allocs[i];
-
-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
-#endif
-            return true;
-        }
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
-#ifndef NDEBUG
-                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
-#endif
-                return true;
-            }
-        }
-    }
-
-    return false;
-}
-
-bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
-    if (ggml_gallocr_needs_realloc(galloc, graph)) {
-        if (galloc->n_buffers == 1) {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
-#endif
-            if (!ggml_gallocr_reserve(galloc, graph)) {
-                return false;
-            }
-        } else {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
-#endif
-            return false;
-        }
-    }
-
-    // reset buffers
-    for (int i = 0; i < galloc->n_buffers; i++) {
-        if (galloc->buffers[i] != NULL) {
-            ggml_backend_buffer_reset(galloc->buffers[i]);
-        }
-    }
-
-    // allocate the graph tensors from the previous assignments
-    // leafs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
-    }
-    // nodes
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
-        }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
-    }
-
-    return true;
-}
-
-size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
-    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
-
-    if (galloc->buffers[buffer_id] == NULL) {
-        return 0;
-    }
-    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
-}
-
-// utils
-
-static bool alloc_tensor_range(struct ggml_context * ctx,
-        struct ggml_tensor * first, struct ggml_tensor * last,
-        ggml_backend_buffer_type_t buft, size_t size,
-        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
-    if (buffer == NULL) {
-#ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
-#endif
-        for (size_t i = 0; i < *n_buffers; i++) {
-            ggml_backend_buffer_free(*buffers[i]);
-        }
-        free(*buffers);
-        return false;
-    }
-
-    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
-
-    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL) {
-            if (t->view_src == NULL) {
-                ggml_tallocr_alloc(&tallocr, t);
-            } else if (t->buffer == NULL) {
-                ggml_backend_view_init(buffer, t);
-            }
-        } else {
-            if (t->view_src != NULL && t->buffer == NULL) {
-                // view of a pre-allocated tensor
-                ggml_backend_view_init(buffer, t);
-            }
-        }
-    }
-
-    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
-    (*buffers)[(*n_buffers)++] = buffer;
-
-    return true;
-}
-
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-    size_t max_size = ggml_backend_buft_get_max_size(buft);
-
-    ggml_backend_buffer_t * buffers = NULL;
-    size_t n_buffers = 0;
-
-    size_t cur_buf_size = 0;
-    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
-    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        size_t this_size = 0;
-        if (t->data == NULL && t->view_src == NULL) {
-            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-
-        if (this_size > max_size) {
-            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
-
-        if ((cur_buf_size + this_size) > max_size) {
-            // allocate tensors in the current buffer
-            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
-                return NULL;
-            }
-            first = t;
-            cur_buf_size = this_size;
-        } else {
-            cur_buf_size += this_size;
-        }
-    }
-
-    // allocate remaining tensors
-    if (cur_buf_size > 0) {
-        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
-            return NULL;
-        }
-    }
-
-    if (n_buffers == 0) {
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer;
-    if (n_buffers == 1) {
-        buffer = buffers[0];
-    } else {
-        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
-    }
-    free(buffers);
-    return buffer;
-}
-
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
-    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
-}
+#include "ggml-alloc.h"
+#include "ggml-backend-impl.h"
+#include "ggml.h"
+#include "ggml-impl.h"
+#include <assert.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MAX_FREE_BLOCKS 256
+
+//#define GGML_ALLOCATOR_DEBUG
+
+//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+#define AT_PRINTF(...)
+
+
+static bool ggml_is_view(const struct ggml_tensor * t) {
+    return t->view_src != NULL;
+}
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static bool ggml_op_can_inplace(enum ggml_op op) {
+    switch (op) {
+        case GGML_OP_SCALE:
+        case GGML_OP_DIAG_MASK_ZERO:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_LOG:
+        case GGML_OP_UNARY:
+        case GGML_OP_ROPE:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SOFT_MAX:
+            return true;
+
+        default:
+            return false;
+    }
+}
+
+static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
+    assert(alignment && !(alignment & (alignment - 1))); // power of 2
+    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
+    return offset + align;
+}
+
+// tallocr
+
+struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
+    void * base = ggml_backend_buffer_get_base(buffer);
+    size_t align = ggml_backend_buffer_get_alignment(buffer);
+
+    assert(align && !(align & (align - 1))); // power of 2
+
+    struct ggml_tallocr talloc = (struct ggml_tallocr) {
+        /*.buffer    = */ buffer,
+        /*.base      = */ base,
+        /*.alignment = */ align,
+        /*.offset    = */ aligned_offset(base, 0, align),
+    };
+    return talloc;
+}
+
+void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
+    size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
+    size = GGML_PAD(size, talloc->alignment);
+
+    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
+        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
+        GGML_ASSERT(!"not enough space in the buffer");
+        return;
+    }
+
+    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
+    talloc->offset += size;
+
+    assert(((uintptr_t)addr % talloc->alignment) == 0);
+
+    ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
+}
+
+// dynamic tensor allocator
+
+struct free_block {
+    size_t offset;
+    size_t size;
+};
+
+struct ggml_dyn_tallocr {
+    size_t alignment;
+    int n_free_blocks;
+    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    size_t max_size;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    struct {
+        const struct ggml_tensor * tensor;
+        size_t offset;
+    } allocated_tensors[1024];
+#endif
+};
+
+#ifdef GGML_ALLOCATOR_DEBUG
+static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (alloc->allocated_tensors[i].tensor == NULL) {
+            alloc->allocated_tensors[i].tensor = tensor;
+            alloc->allocated_tensors[i].offset = offset;
+            return;
+        }
+    }
+    GGML_ASSERT(!"out of allocated_tensors");
+}
+static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (alloc->allocated_tensors[i].offset == offset) {
+            alloc->allocated_tensors[i].tensor = NULL;
+            return;
+        }
+    }
+    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
+    GGML_ASSERT(!"tensor not found");
+}
+#endif
+
+static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
+    size = aligned_offset(NULL, size, alloc->alignment);
+
+    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+
+    size_t max_avail = 0;
+
+    // find the best fitting free block besides the last block
+    int best_fit_block = -1;
+    size_t best_fit_size = SIZE_MAX;
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
+        max_avail = MAX(max_avail, block->size);
+        if (block->size >= size && block->size <= best_fit_size) {
+            best_fit_block = i;
+            best_fit_size = block->size;
+        }
+    }
+
+    if (best_fit_block == -1) {
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+        } else {
+            // this should never happen
+            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
+            GGML_UNREACHABLE();
+        }
+    }
+
+    struct free_block * block = &alloc->free_blocks[best_fit_block];
+    size_t offset = block->offset;
+    block->offset = offset + size;
+    block->size -= size;
+    if (block->size == 0) {
+        // remove block if empty
+        alloc->n_free_blocks--;
+        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
+            alloc->free_blocks[j] = alloc->free_blocks[j+1];
+        }
+    }
+
+    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    add_allocated_tensor(alloc, offset, tensor);
+    size_t cur_max = offset + size;
+    if (cur_max > alloc->max_size) {
+        // sort allocated_tensors by offset
+        for (int i = 0; i < 1024; i++) {
+            for (int j = i + 1; j < 1024; j++) {
+                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
+                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
+                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
+                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+                    alloc->allocated_tensors[j].tensor = tmp_tensor;
+                    alloc->allocated_tensors[j].offset = tmp_offset;
+                }
+            }
+        }
+        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        for (int i = 0; i < 1024; i++) {
+            if (alloc->allocated_tensors[i].tensor) {
+                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                    alloc->allocated_tensors[i].offset,
+                    alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
+            }
+        }
+        fprintf(stderr, "\n");
+    }
+#endif
+
+    alloc->max_size = MAX(alloc->max_size, offset + size);
+
+    return offset;
+
+    GGML_UNUSED(tensor);
+}
+
+// this is a very naive implementation, but for our case the number of free blocks should be very small
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
+    size = aligned_offset(NULL, size, alloc->alignment);
+
+    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, offset, tensor);
+#endif
+
+    // see if we can merge with an existing block
+    for (int i = 0; i < alloc->n_free_blocks; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
+        // check if ptr is at the end of the block
+        if (block->offset + block->size == offset) {
+            block->size += size;
+            // check if we can merge with the next block
+            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
+                block->size += alloc->free_blocks[i+1].size;
+                alloc->n_free_blocks--;
+                for (int j = i+1; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                }
+            }
+            return;
+        }
+        // check if ptr is at the beginning of the block
+        if (offset + size == block->offset) {
+            block->offset = offset;
+            block->size += size;
+            // check if we can merge with the previous block
+            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
+                alloc->free_blocks[i-1].size += block->size;
+                alloc->n_free_blocks--;
+                for (int j = i; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                }
+            }
+            return;
+        }
+    }
+    // otherwise, add a new block
+    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = 0;
+    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+        alloc->free_blocks[i] = alloc->free_blocks[i-1];
+    }
+    // insert the new block
+    alloc->free_blocks[insert_pos].offset = offset;
+    alloc->free_blocks[insert_pos].size = size;
+    alloc->n_free_blocks++;
+
+    GGML_UNUSED(tensor);
+}
+
+static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
+    alloc->n_free_blocks = 1;
+    alloc->free_blocks[0].offset = 0;
+    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+    alloc->max_size = 0;
+}
+
+static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
+    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
+
+    *alloc = (struct ggml_dyn_tallocr) {
+        /*.alignment     = */ alignment,
+        /*.n_free_blocks = */ 0,
+        /*.free_blocks   = */ {{0}},
+        /*.max_size      = */ 0,
+#ifdef GGML_ALLOCATOR_DEBUG
+        /*.allocated_tensors = */ {{0}},
+#endif
+    };
+
+    ggml_dyn_tallocr_reset(alloc);
+
+    return alloc;
+}
+
+static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
+    free(alloc);
+}
+
+static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
+    return alloc->max_size;
+}
+
+
+/////////////////////////////////////
+
+// graph allocator
+
+struct hash_node {
+    int n_children;
+    int n_views;
+    int buffer_id;
+    size_t offset; // offset within the buffer
+    bool allocated;
+};
+
+struct tensor_alloc {
+    size_t offset;
+    size_t size_max; // 0 = pre-allocated, unused, or view
+};
+
+struct leaf_alloc {
+    int buffer_id;
+    struct tensor_alloc leaf;
+};
+
+struct node_alloc {
+    int buffer_id;
+    struct tensor_alloc dst;
+    struct tensor_alloc src[GGML_MAX_SRC];
+};
+
+struct ggml_gallocr {
+    ggml_backend_buffer_type_t * bufts; // [n_buffers]
+    ggml_backend_buffer_t * buffers; // [n_buffers]
+    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
+    int n_buffers;
+
+    struct ggml_hash_set hash_set;
+    struct hash_node * hash_values; // [hash_set.size]
+
+    struct node_alloc * node_allocs; // [n_nodes]
+    int n_nodes;
+
+    struct leaf_alloc * leaf_allocs; // [n_leafs]
+    int n_leafs;
+};
+
+ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
+    GGML_ASSERT(galloc != NULL);
+
+    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
+    GGML_ASSERT(galloc->bufts != NULL);
+
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
+    GGML_ASSERT(galloc->buffers != NULL);
+
+    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
+    GGML_ASSERT(galloc->buf_tallocs != NULL);
+
+    for (int i = 0; i < n_bufs; i++) {
+        galloc->bufts[i] = bufts[i];
+        galloc->buffers[i] = NULL;
+        size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+        galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+    }
+    galloc->n_buffers = n_bufs;
+
+    return galloc;
+}
+
+ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
+    return ggml_gallocr_new_n(&buft, 1);
+}
+
+void ggml_gallocr_free(ggml_gallocr_t galloc) {
+    if (galloc == NULL) {
+        return;
+    }
+
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        if (galloc->buffers != NULL) {
+            ggml_backend_buffer_free(galloc->buffers[i]);
+        }
+        if (galloc->buf_tallocs != NULL) {
+            ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+        }
+    }
+
+    free(galloc->hash_set.keys);
+    free(galloc->hash_values);
+    free(galloc->bufts);
+    free(galloc->buffers);
+    free(galloc->buf_tallocs);
+    free(galloc->node_allocs);
+    free(galloc->leaf_allocs);
+    free(galloc);
+}
+
+typedef struct ggml_gallocr * ggml_gallocr_t;
+
+static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    return &galloc->hash_values[i];
+}
+
+static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+    return ggml_gallocr_hash_get(galloc, t)->allocated;
+}
+
+static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
+    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+    hn->buffer_id = buffer_id;
+    hn->offset = offset;
+    hn->allocated = true;
+}
+
+static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+    return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
+}
+
+static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+
+    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
+        hn->allocated = true;
+        assert(hn->offset == 0);
+
+        // try to reuse a parent's buffer (inplace)
+        if (ggml_op_can_inplace(node->op)) {
+            for (int i = 0; i < GGML_MAX_SRC; i++) {
+                struct ggml_tensor * parent = node->src[i];
+                if (parent == NULL) {
+                    continue;
+                }
+
+                // if the node's data is external, then we cannot re-use it
+                if (!ggml_gallocr_is_own(galloc, parent)) {
+                    AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                    continue;
+                }
+
+                // outputs cannot be reused
+                if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
+                    AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
+                    continue;
+                }
+
+                if (!ggml_are_same_layout(node, parent)) {
+                    AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
+                    continue;
+                }
+
+                struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+                if (p_hn->n_children == 1 && p_hn->n_views == 0) {
+                    if (ggml_is_view(parent)) {
+                        struct ggml_tensor * view_src = parent->view_src;
+                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                            assert(view_src_hn->offset == p_hn->offset);
+                            hn->buffer_id = p_hn->buffer_id;
+                            hn->offset = p_hn->offset;
+                            p_hn->allocated = false; // avoid freeing the parent
+                            view_src_hn->allocated = false;
+                            return;
+                        }
+                    } else {
+                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                        hn->buffer_id = p_hn->buffer_id;
+                        hn->offset = p_hn->offset;
+                        p_hn->allocated = false; // avoid freeing the parent
+                        return;
+                    }
+                }
+            }
+        }
+        // allocate tensor from the buffer
+        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
+        hn->buffer_id = buffer_id;
+        hn->offset = offset;
+        return;
+    }
+}
+
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+    // graph outputs are never freed
+    if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+        AT_PRINTF("not freeing output %s\n", node->name);
+        return;
+    }
+
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+    size_t offset = hn->offset;
+    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+    hn->allocated = false;
+}
+
+static int get_node_buffer_id(const int * node_buffer_ids, int i) {
+    return node_buffer_ids ? node_buffer_ids[i] : 0;
+}
+
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    // clear hash tables
+    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
+    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
+
+    // allocate leafs
+    // these may be tensors that are not used in the graph, but that the application may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
+    }
+
+    // count number of children and views
+    // allocate other graph inputs and leafs first to avoid overwriting them
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
+            struct ggml_tensor * view_src = node->view_src;
+            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
+        }
+
+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+        }
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                continue;
+            }
+
+            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+            }
+        }
+    }
+
+    // allocate tensors
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        int buffer_id = get_node_buffer_id(node_buffer_ids, i);
+
+        // allocate parents (only leafs need to be allocated at this point)
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * parent = node->src[j];
+            if (parent == NULL) {
+                continue;
+            }
+            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
+        }
+
+        // allocate node
+        ggml_gallocr_allocate_node(galloc, node, buffer_id);
+
+        AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * parent = node->src[j];
+            if (parent == NULL) {
+                continue;
+            }
+            AT_PRINTF("%s", parent->name);
+            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                AT_PRINTF(", ");
+            }
+        }
+        AT_PRINTF("\n");
+
+        // update parents
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * parent = node->src[j];
+            if (parent == NULL) {
+                continue;
+            }
+            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+            p_hn->n_children -= 1;
+
+            AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
+                parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
+
+            if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                if (ggml_is_view(parent)) {
+                    struct ggml_tensor * view_src = parent->view_src;
+                    struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+                    view_src_hn->n_views -= 1;
+                    AT_PRINTF("view_src %s: %d children, %d views\n",
+                        view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+                    if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
+                        ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                    }
+                }
+                else if (p_hn->allocated) {
+                    ggml_gallocr_free_node(galloc, parent, buffer_id);
+                }
+            }
+            AT_PRINTF("\n");
+        }
+    }
+}
+
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
+    size_t hash_size = graph->visited_hash_table.size;
+
+    // initialize hash table
+    if (galloc->hash_set.size < hash_size) {
+        free(galloc->hash_set.keys);
+        free(galloc->hash_values);
+        galloc->hash_set.size = hash_size;
+        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
+        GGML_ASSERT(galloc->hash_set.keys != NULL);
+        GGML_ASSERT(galloc->hash_values != NULL);
+    } else {
+        // reset hash table
+        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
+        memset(galloc->hash_values,   0, sizeof(struct hash_node) * galloc->hash_set.size);
+    }
+
+    // reset allocators
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
+    }
+
+    // allocate in hash table
+    ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
+
+    // set the node_allocs from the hash table
+    if (galloc->n_nodes < graph->n_nodes) {
+        free(galloc->node_allocs);
+        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
+        GGML_ASSERT(galloc->node_allocs != NULL);
+    }
+    galloc->n_nodes = graph->n_nodes;
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        struct node_alloc * node_alloc = &galloc->node_allocs[i];
+        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
+        if (node->view_src || node->data) {
+            node_alloc->dst.offset = SIZE_MAX;
+            node_alloc->dst.size_max = 0;
+        } else {
+            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+            node_alloc->dst.offset   = hn->offset;
+            node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+        }
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (!src || src->view_src || src->data) {
+                node_alloc->src[j].offset = SIZE_MAX;
+                node_alloc->src[j].size_max = 0;
+            } else {
+                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].offset   = hn->offset;
+                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
+            }
+        }
+    }
+    if (galloc->n_leafs < graph->n_leafs) {
+        free(galloc->leaf_allocs);
+        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
+        GGML_ASSERT(galloc->leaf_allocs != NULL);
+    }
+    galloc->n_leafs = graph->n_leafs;
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
+        if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.size_max = 0;
+        } else {
+            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+        }
+    }
+
+    // reallocate buffers if needed
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
+
+        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+#endif
+            ggml_backend_buffer_free(galloc->buffers[i]);
+            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+            if (galloc->buffers[i] == NULL) {
+                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+    return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
+}
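+
+// a minimal sketch for the multi-buffer case, assuming node_buffer_ids and
+// leaf_buffer_ids map each node/leaf to one of the buffer types passed to
+// ggml_gallocr_new_n; with multiple buffers, reserve_n must be called
+// explicitly since ggml_gallocr_alloc_graph cannot reallocate automatically
+//
+//     if (!ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids, leaf_buffer_ids)) {
+//         // buffer allocation failed, e.g. out of device memory
+//     }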
+
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
+
+    if (tensor->view_src != NULL) {
+        if (tensor->buffer == NULL) {
+            assert(tensor_alloc->offset == SIZE_MAX);
+            if (tensor->view_src->buffer == NULL) {
+                // this tensor was allocated without ggml-backend
+                return;
+            }
+            ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+        }
+    } else {
+        if (tensor->data == NULL) {
+            assert(tensor_alloc->offset != SIZE_MAX);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
+            void * addr = (char *)base + tensor_alloc->offset;
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
+        } else {
+            if (tensor->buffer == NULL) {
+                // this tensor was allocated without ggml-backend
+                return;
+            }
+        }
+    }
+}
+
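+// note: returns true when the node still fits in its previously reserved
+// assignment (talloc->size_max >= node_size); callers negate the result to
+// detect that a reallocation is needed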
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+    return talloc->size_max >= node_size;
+}
+
+static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+    if (galloc->n_nodes != graph->n_nodes) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+#endif
+        return true;
+    }
+
+    if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+        return true;
+    }
+
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        struct node_alloc * node_alloc = &galloc->node_allocs[i];
+
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+#endif
+            return true;
+        }
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                continue;
+            }
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+#ifndef NDEBUG
+                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+#endif
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+    if (ggml_gallocr_needs_realloc(galloc, graph)) {
+        if (galloc->n_buffers == 1) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+#endif
+            if (!ggml_gallocr_reserve(galloc, graph)) {
+                return false;
+            }
+        } else {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+#endif
+            return false;
+        }
+    }
+
+    // reset buffers
+    for (int i = 0; i < galloc->n_buffers; i++) {
+        if (galloc->buffers[i] != NULL) {
+            ggml_backend_buffer_reset(galloc->buffers[i]);
+        }
+    }
+
+    // allocate the graph tensors from the previous assignments
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+    }
+    // nodes
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        struct node_alloc * node_alloc = &galloc->node_allocs[i];
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                continue;
+            }
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+        }
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+    }
+
+    return true;
+}
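+
+// a minimal per-batch sketch, assuming a single-buffer allocator and an
+// application-defined build_graph(); with a single buffer, a topology change
+// triggers an automatic reserve
+//
+//     struct ggml_cgraph * graph = build_graph(batch);
+//     if (ggml_gallocr_alloc_graph(galloc, graph)) {
+//         ggml_backend_graph_compute(backend, graph);
+//     }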
+
+size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+    if (galloc->buffers[buffer_id] == NULL) {
+        return 0;
+    }
+    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
+}
+
+// utils
+
+static bool alloc_tensor_range(struct ggml_context * ctx,
+        struct ggml_tensor * first, struct ggml_tensor * last,
+        ggml_backend_buffer_type_t buft, size_t size,
+        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
+    if (buffer == NULL) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
+#endif
+        for (size_t i = 0; i < *n_buffers; i++) {
+            ggml_backend_buffer_free((*buffers)[i]);
+        }
+        free(*buffers);
+        return false;
+    }
+
+    struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
+
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                ggml_tallocr_alloc(&tallocr, t);
+            } else if (t->buffer == NULL) {
+                ggml_backend_view_init(buffer, t);
+            }
+        } else {
+            if (t->view_src != NULL && t->buffer == NULL) {
+                // view of a pre-allocated tensor
+                ggml_backend_view_init(buffer, t);
+            }
+        }
+    }
+
+    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+    (*buffers)[(*n_buffers)++] = buffer;
+
+    return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    size_t cur_buf_size = 0;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        if (this_size > max_size) {
+            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+                    __func__, t->name,
+                    ggml_backend_buft_name(buft),
+                    this_size, max_size);
+            for (size_t i = 0; i < n_buffers; i++) {
+                ggml_backend_buffer_free(buffers[i]);
+            }
+            free(buffers);
+            return NULL;
+        }
+
+        if ((cur_buf_size + this_size) > max_size) {
+            // allocate tensors in the current buffer
+            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
+            first = t;
+            cur_buf_size = this_size;
+        } else {
+            cur_buf_size += this_size;
+        }
+    }
+
+    // allocate remaining tensors
+    if (cur_buf_size > 0) {
+        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+    }
+
+    if (n_buffers == 0) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
+    return buffer;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
+    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
+}
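+
+// a minimal sketch, assuming a no_alloc context whose tensors should all live
+// in the backend's default buffer type
+//
+//     struct ggml_init_params params = {
+//         /*.mem_size   =*/ ggml_tensor_overhead() * 8, // headers for up to 8 tensors
+//         /*.mem_buffer =*/ NULL,
+//         /*.no_alloc   =*/ true,
+//     };
+//     struct ggml_context * ctx = ggml_init(params);
+//     struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
+//     ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+//     // NULL is returned on failure or if all tensors were already allocated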

+ 76 - 76
llama/ggml-alloc.h

@@ -1,76 +1,76 @@
-#pragma once
-
-#include "ggml.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
-
-// Tensor allocator
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
-
-// Graph allocator
-/*
-  Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
-
-    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
-    ggml_gallocr_reserve(galloc, build_graph(max_batch));
-
-    // allocate the graph
-    struct ggml_cgraph * graph = build_graph(batch);
-    ggml_gallocr_alloc_graph(galloc, graph);
-
-    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
-
-    // evaluate the graph
-    ggml_backend_graph_compute(backend, graph);
-*/
-
-// special tensor flags for use with the graph allocator:
-//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
-//   ggml_set_output(): output tensors are never freed and never overwritten
-
-typedef struct ggml_gallocr * ggml_gallocr_t;
-
-GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
-GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
-GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
-
-// pre-allocate buffers from a measure graph - does not allocate or modify the graph
-// call with a worst-case graph to avoid buffer reallocations
-// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
-// returns false if the buffer allocation failed
-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(
-    ggml_gallocr_t galloc,
-    struct ggml_cgraph * graph,
-    const int * node_buffer_ids,
-    const int * leaf_buffer_ids);
-
-// automatic reallocation if the topology changes when using a single buffer
-// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
-GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-
-GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-
-// Utils
-// Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
-
-#ifdef  __cplusplus
-}
-#endif
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;
+
+// Tensor allocator
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
+
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
+
+// Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+
+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten
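+//
+//   a minimal sketch, assuming ctx, a weight tensor w and n_embd are set up:
+//     struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+//     ggml_set_input(inp);
+//     struct ggml_tensor * out = ggml_mul_mat(ctx, w, inp);
+//     ggml_set_output(out);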
+
+typedef struct ggml_gallocr * ggml_gallocr_t;
+
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
+
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
+
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
+
+#ifdef  __cplusplus
+}
+#endif

+ 141 - 141
llama/ggml-backend-impl.h

@@ -1,141 +1,141 @@
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
-    struct ggml_backend_buffer_type_i {
-        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
-        // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
-    };
-
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
-    };
-
-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
-
-    struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-    };
-
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
-        ggml_backend_buffer_context_t context;
-        size_t size;
-        enum ggml_backend_buffer_usage usage;
-    };
-
-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t      buft,
-            struct ggml_backend_buffer_i           iface,
-                   ggml_backend_buffer_context_t   context,
-                   size_t                          size);
-
-    // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // buffer that contains a collection of buffers
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-
-    //
-    // Backend
-    //
-
-    typedef void * ggml_backend_context_t;
-
-    struct ggml_backend_i {
-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
-
-        void (*GGML_CALL free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
-
-        // (optional) asynchronous tensor data access
-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // (optional) complete all pending operations
-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
-
-        // compute graph with a plan (not used currently)
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
-        // these should be expensive operations with large batch sizes that may benefit from running on this backend
-        // even if the weight has to be copied from the CPU temporarily
-        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // (optional) event synchronization
-        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
-        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
-        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
-        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
-        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
-    };
-
-    struct ggml_backend {
-        ggml_guid_t guid;
-
-        struct ggml_backend_i iface;
-        ggml_backend_context_t context;
-    };
-
-    struct ggml_backend_event {
-        ggml_backend_t backend;
-        void * context;
-    };
-
-    //
-    // Backend registry
-    //
-
-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
-
-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
-#ifdef  __cplusplus
-}
-#endif
+#pragma once
+
+// ggml-backend internal header
+
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    //
+    // Backend buffer
+    //
+
+    // buffer type
+    typedef void * ggml_backend_buffer_type_context_t;
+
+    struct ggml_backend_buffer_type_i {
+        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
+    };
+
+    struct ggml_backend_buffer_type {
+        struct ggml_backend_buffer_type_i  iface;
+        ggml_backend_buffer_type_context_t context;
+    };
+
+    // buffer
+    typedef void * ggml_backend_buffer_context_t;
+
+    struct ggml_backend_buffer_i {
+        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+    };
+
+    struct ggml_backend_buffer {
+        struct ggml_backend_buffer_i  iface;
+        ggml_backend_buffer_type_t    buft;
+        ggml_backend_buffer_context_t context;
+        size_t size;
+        enum ggml_backend_buffer_usage usage;
+    };
+
+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
+                   ggml_backend_buffer_type_t      buft,
+            struct ggml_backend_buffer_i           iface,
+                   ggml_backend_buffer_context_t   context,
+                   size_t                          size);
+
+    // do not use directly, use ggml_backend_tensor_copy instead
+    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
+    //
+    // Backend
+    //
+
+    typedef void * ggml_backend_context_t;
+
+    struct ggml_backend_i {
+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
+
+        void (*GGML_CALL free)(ggml_backend_t backend);
+
+        // buffer allocation
+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
+
+        // (optional) asynchronous tensor data access
+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        // (optional) complete all pending operations
+        void (*GGML_CALL synchronize)(ggml_backend_t backend);
+
+        // compute graph with a plan (not used currently)
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+        // compute graph with a plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // compute graph without a plan (async)
+        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+        // check if the backend supports an operation
+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
+        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
+    };
+
+    struct ggml_backend {
+        ggml_guid_t guid;
+
+        struct ggml_backend_i iface;
+        ggml_backend_context_t context;
+    };
+
+    struct ggml_backend_event {
+        ggml_backend_t backend;
+        void * context;
+    };
+
+    //
+    // Backend registry
+    //
+
+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
+
+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
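+
+    // a minimal registration sketch, assuming my_backend_init matches
+    // ggml_backend_init_fn and my_backend_buffer_type() returns the backend's
+    // default buffer type:
+    //     ggml_backend_register("my-backend", my_backend_init, my_backend_buffer_type(), NULL);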
+
+#ifdef  __cplusplus
+}
+#endif

+ 4 - 7
llama/ggml-backend.c

@@ -56,6 +56,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
 }
 
 // backend buffer
+
 GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                ggml_backend_buffer_type_t      buft,
         struct ggml_backend_buffer_i           iface,
@@ -78,10 +79,6 @@ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name(buffer);
 }
 
-#define ggml_assert_aligned(ptr) \
-    GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
-
-
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer == NULL) {
         return;
@@ -90,9 +87,9 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
-    
-    // TODO: this needs to be freed in cuda and hipblas backends because
-    // the cuda backend implementation compiled with msvc
+
+// TODO: this needs to be freed in cuda and hipblas backends because
+// the cuda backend implementation is compiled with msvc
 #if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
     free(buffer);
 #endif

+ 233 - 233
llama/ggml-backend.h

@@ -1,233 +1,233 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-    typedef struct ggml_backend_event * ggml_backend_event_t;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
-    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
-
-    // buffer
-    enum ggml_backend_buffer_usage {
-        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
-        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
-    };
-
-    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
-
-    //
-    // Backend
-    //
-
-    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
-    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
-    GGML_API void         ggml_backend_free(ggml_backend_t backend);
-
-    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
-    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
-    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
-
-    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
-
-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
-    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
-
-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // asynchronous copy
-    // the copy is performed after all the currently queued operations in backend_src
-    // backend_dst will wait for the copy to complete before performing other operations
-    // automatic fallback to sync copy if async is not supported
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // events
-    GGML_API ggml_backend_event_t   ggml_backend_event_new        (ggml_backend_t backend);
-    GGML_API void                   ggml_backend_event_free       (ggml_backend_event_t event);
-    GGML_API void                   ggml_backend_event_record     (ggml_backend_event_t event);
-    GGML_API void                   ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void                   ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
-
-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
-    // Create a backend buffer from an existing pointer
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
-    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
-    //
-    // Backend registry
-    //
-
-    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
-
-    GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
-    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
-
-    //
-    // Backend scheduler
-    //
-
-    // The backend scheduler allows for multiple backends to be used together
-    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
-    // The backends are selected based on:
-    // - the backend that supports the operation
-    // - the location of the pre-allocated tensors (e.g. the weights)
-    /*
-      Example usage:
-
-        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
-        // preferrably to run on the same backend as the buffer
-        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-
-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
-
-        // initialize buffers from a max size graph (optional)
-        reserve_graph = build_graph(sched, max_batch_size);
-
-        // manually assign nodes to a backend (optional, should not be needed in most cases)
-        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
-
-        ggml_backend_sched_reserve(sched, reserve_graph);
-
-        // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
-
-        // if there are graph inputs:
-        ggml_backend_sched_reset(sched);
-        ggml_backend_sched_alloc_graph(sched, graph);
-        ggml_backend_tensor_set(input_tensor, ...);
-        ggml_backend_sched_graph_compute(sched, graph);
-    }
-    */
-
-    struct ggml_backend_sched;
-    typedef struct ggml_backend_sched * ggml_backend_sched_t;
-
-    // when ask == true, the scheduler wants to know if the user wants to observe this node
-    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
-    //
-    // when ask == false, the scheduler is passing the node tensor to the user for observation
-    // if the user returns false, the scheduler will cancel the graph compute
-    //
-    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-
-    // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
-    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
-
-    // Initialize backend buffers from a measure graph
-    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
-
-    // Get the number of splits of the last graph
-    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
-    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
-
-    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
-    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
-
-    // Allocate and compute graph on the backend scheduler
-    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
-
-    // Reset all assignments and allocators - must be called before changing the node backends
-    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
-
-    // Set a callback to be called for each resulting node during graph compute
-    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
-
-    //
-    // Utils
-    //
-
-    struct ggml_backend_graph_copy {
-        ggml_backend_buffer_t buffer;
-        struct ggml_context * ctx_allocated;
-        struct ggml_context * ctx_unallocated;
-        struct ggml_cgraph * graph;
-    };
-
-    // Copy a graph to a different backend
-    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
-    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
-    // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
-    // Tensor initialization
-    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-
-
-#ifdef  __cplusplus
-}
-#endif
+#pragma once
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend_event * ggml_backend_event_t;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+
+    //
+    // Backend buffer
+    //
+
+    // buffer type
+    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
+
+    // buffer
+    enum ggml_backend_buffer_usage {
+        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
+        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+    };
+
+    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+
+    //
+    // Backend
+    //
+
+    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
+    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+    GGML_API void         ggml_backend_free(ggml_backend_t backend);
+
+    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // asynchronous copy
+    // the copy is performed after all the currently queued operations in backend_src
+    // backend_dst will wait for the copy to complete before performing other operations
+    // automatic fallback to sync copy if async is not supported
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // events
+    GGML_API ggml_backend_event_t   ggml_backend_event_new        (ggml_backend_t backend);
+    GGML_API void                   ggml_backend_event_free       (ggml_backend_event_t event);
+    GGML_API void                   ggml_backend_event_record     (ggml_backend_event_t event);
+    GGML_API void                   ggml_backend_event_synchronize(ggml_backend_event_t event);
+    GGML_API void                   ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
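A minimal sketch of how the async-copy contract above composes with events. This is editorial, not part of the commit; `backend_src`, `backend_dst`, `backend_other`, `src`, `dst` and `dst_graph` are assumed to be set up elsewhere:

    // copy is queued behind backend_src's pending work (see contract above)
    ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);

    // backend_dst serializes after the copy, so work queued on it next can
    // already use dst without an explicit synchronization:
    ggml_backend_graph_compute_async(backend_dst, dst_graph);

    // to make a third backend wait on backend_src, record an event on it:
    ggml_backend_event_t ev = ggml_backend_event_new(backend_src);
    ggml_backend_event_record(ev);
    ggml_backend_event_wait(backend_other, ev); // async wait on backend_other
    ggml_backend_event_synchronize(ev);         // or block the host until it fires
    ggml_backend_event_free(ev);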
+
+    //
+    // CPU backend
+    //
+
+    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    // Create a backend buffer from an existing pointer
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+
+    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
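A self-contained sketch of the CPU-backend flow implied by these declarations. It assumes `ctx`, `graph`, `input` and `output` were built with the usual ggml graph API, and that `ggml_backend_alloc_ctx_tensors` from ggml-alloc.h (included above) is available:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    static enum ggml_status run_on_cpu(struct ggml_context * ctx, struct ggml_cgraph * graph,
                                       struct ggml_tensor * input, const float * in_data,
                                       struct ggml_tensor * output, float * out_data) {
        ggml_backend_t cpu = ggml_backend_cpu_init();
        ggml_backend_cpu_set_n_threads(cpu, 4);

        // place every tensor of ctx in a buffer owned by the CPU backend
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, cpu);

        ggml_backend_tensor_set(input, in_data, 0, ggml_nbytes(input));
        enum ggml_status st = ggml_backend_graph_compute(cpu, graph);
        ggml_backend_tensor_get(output, out_data, 0, ggml_nbytes(output));

        ggml_backend_buffer_free(buf);
        ggml_backend_free(cpu);
        return st;
    }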
+
+    //
+    // Backend registry
+    //
+
+    // The backend registry keeps track of all the available backends and allows initializing them in a generic way
+
+    GGML_API size_t                     ggml_backend_reg_get_count(void);
+    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
+    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
+    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
+
+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows for multiple backends to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will preferably
+        // be assigned to run on the same backend as the buffer
+        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+
+        // initialize buffers from a max size graph (optional)
+        reserve_graph = build_graph(sched, max_batch_size);
+
+        // manually assign nodes to a backend (optional, should not be needed in most cases)
+        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
+
+        ggml_backend_sched_reserve(sched, reserve_graph);
+
+        // compute
+        graph = build_graph(sched);
+        ggml_backend_sched_graph_compute(sched, graph);
+
+        // if there are graph inputs:
+        ggml_backend_sched_reset(sched);
+        ggml_backend_sched_alloc_graph(sched, graph);
+        ggml_backend_tensor_set(input_tensor, ...);
+        ggml_backend_sched_graph_compute(sched, graph);
+    */
+
+    struct ggml_backend_sched;
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
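A sketch of a callback that follows this two-phase protocol, observing only matrix multiplications. `GGML_OP_MUL_MAT` and the tensor's `op`/`name` fields come from ggml.h; everything else is illustrative:

    #include <stdio.h>
    #include "ggml.h"

    static bool observe_mul_mat(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            // phase 1: tell the scheduler whether we want to see this node
            return t->op == GGML_OP_MUL_MAT;
        }
        // phase 2: t now holds computed data; returning false would cancel
        // the rest of the graph compute
        fprintf(stderr, "observed %s\n", t->name);
        return true;
    }

    // installed with:
    //   ggml_backend_sched_set_eval_callback(sched, observe_mul_mat, NULL);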
+
+    // Initialize a backend scheduler
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
+    // Get the number of splits of the last graph
+    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
+
+    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+
+    // Allocate and compute graph on the backend scheduler
+    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
+
+    // Reset all assignments and allocators - must be called before changing the node backends
+    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
+
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
+    //
+    // Utils
+    //
+
+    struct ggml_backend_graph_copy {
+        ggml_backend_buffer_t buffer;
+        struct ggml_context * ctx_allocated;
+        struct ggml_context * ctx_unallocated;
+        struct ggml_cgraph * graph;
+    };
+
+    // Copy a graph to a different backend
+    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+    // Compare the output of two backends
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+    // Tensor initialization
+    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+
+#ifdef  __cplusplus
+}
+#endif

+ 1853 - 1853
llama/ggml-common.h

@@ -1,1853 +1,1853 @@
-#ifndef GGML_COMMON_DECL
-
-#if defined(GGML_COMMON_DECL_C)
-#include <stdint.h>
-
-typedef uint16_t ggml_half;
-typedef uint32_t ggml_half2;
-
-#define GGML_COMMON_AGGR
-
-#define GGML_COMMON_DECL
-#elif defined(GGML_COMMON_DECL_METAL)
-#include <metal_stdlib>
-
-typedef half  ggml_half;
-typedef half2 ggml_half2;
-
-#define GGML_COMMON_AGGR
-
-#define GGML_COMMON_DECL
-#elif defined(GGML_COMMON_DECL_CUDA)
-#include <cuda_fp16.h>
-#include <cstdint>
-
-typedef half  ggml_half;
-typedef half2 ggml_half2;
-
-#define GGML_COMMON_AGGR data
-
-#define GGML_COMMON_DECL
-#elif defined(GGML_COMMON_DECL_HIP)
-#include <hip/hip_fp16.h>
-#include <cstdint>
-
-typedef half  ggml_half;
-typedef half2 ggml_half2;
-
-#define GGML_COMMON_AGGR data
-
-#define GGML_COMMON_DECL
-#elif defined(GGML_COMMON_DECL_SYCL)
-#include <sycl/half_type.hpp>
-#include <cstdint>
-
-typedef sycl::half  ggml_half;
-typedef sycl::half2 ggml_half2;
-
-#define GGML_COMMON_AGGR data
-
-#define GGML_COMMON_DECL
-#endif
-
-#if defined(GGML_COMMON_DECL)
-
-#ifndef __cplusplus
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-#endif // __cplusplus
-
-// QK = number of values after dequantization
-// QK_K = super-block size
-
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif // GGML_QKK_64
-
-#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-#define QR4_0 2
-
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-#define QR4_1 2
-
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-#define QR5_0 2
-
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-#define QR5_1 2
-
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-#define QR8_0 1
-
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-#define QR8_1 1
-
-#define QI2_K (QK_K / (4*QR2_K))
-#define QR2_K 4
-
-#define QI3_K (QK_K / (4*QR3_K))
-#define QR3_K 4
-
-#define QI4_K (QK_K / (4*QR4_K))
-#define QR4_K 2
-
-#define QI5_K (QK_K / (4*QR5_K))
-#define QR5_K 2
-
-#define QI6_K (QK_K / (4*QR6_K))
-#define QR6_K 2
-
-#define QI2_XXS (QK_K / (4*QR2_XXS))
-#define QR2_XXS 8
-
-#define QI2_XS (QK_K / (4*QR2_XS))
-#define QR2_XS 8
-
-#define QI2_S (QK_K / (4*QR2_S))
-#define QR2_S 8
-
-#define QI3_XXS (QK_K / (4*QR3_XXS))
-#define QR3_XXS 8
-
-#define QI3_XS (QK_K / (4*QR3_XS))
-#define QR3_XS 8
-
-#define QI1_S (QK_K / (4*QR1_S))
-#define QR1_S 8
-
-#define QI4_NL (QK4_NL / (4*QR4_NL))
-#define QR4_NL 2
-
-#if QK_K == 64
-#define QI4_XS QI4_NL
-#define QR4_XS QR4_NL
-#else
-#define QI4_XS (QK_K / (4*QR4_XS))
-#define QR4_XS 8
-#endif
-
-#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP || GGML_COMMON_DECL_SYCL
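To make the QR/QI relations concrete: for q4_0 (QK4_0 = 32, defined just below) each quant byte packs QR4_0 = 2 values, so a block's quants occupy 32 / 2 = 16 bytes, i.e. QI4_0 = 32 / (4 * 2) = 4 32-bit integers; for q8_0, QR8_0 = 1 gives 32 bytes and QI8_0 = 8 integers. Both match the `qs` arrays of the block structs that follow.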
-
-#define QK4_0 32
-typedef struct {
-    ggml_half d;           // delta
-    uint8_t qs[QK4_0 / 2]; // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 block size/padding");
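As a sanity check on the layout: sizeof(block_q4_0) = 2 (ggml_half d) + 16 (qs) = 18 bytes for 32 weights, i.e. 18 * 8 / 32 = 4.5 bits per weight. The "Effectively N bits per weight" figures quoted for the super-block types below follow from the same arithmetic.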
-
-#define QK4_1 32
-typedef struct {
-    union {
-        struct {
-            ggml_half d; // delta
-            ggml_half m; // min
-        } GGML_COMMON_AGGR;
-        ggml_half2 dm;
-    };
-    uint8_t qs[QK4_1 / 2]; // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-typedef struct {
-    ggml_half d;           // delta
-    uint8_t qh[4];         // 5th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-typedef struct {
-    union {
-        struct {
-            ggml_half d; // delta
-            ggml_half m; // min
-        } GGML_COMMON_AGGR;
-        ggml_half2 dm;
-    };
-    uint8_t qh[4];         // 5th bit of quants
-    uint8_t qs[QK5_1 / 2]; // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-typedef struct {
-    ggml_half d;       // delta
-    int8_t  qs[QK8_0]; // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-typedef struct {
-    union {
-        struct {
-            ggml_half d; // delta
-            ggml_half s; // d * sum(qs[i])
-        } GGML_COMMON_AGGR;
-        ggml_half2 ds;
-    };
-    int8_t qs[QK8_1]; // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
-
-//
-// Super-block quantization structures
-//
-
-// 2-bit quantization
-// weight is represented as x = a * q + b
-// 16 blocks of 16 elements each
-// Effectively 2.625 bits per weight
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    union {
-        struct {
-            ggml_half d;    // super-block scale for quantized scales
-            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
-        ggml_half2 dm;
-    };
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
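Checking the 2.625 figure: 16 (scales) + 64 (qs) + 4 (dm) = 84 bytes per super-block of QK_K = 256 weights, and 84 * 8 / 256 = 2.625 bits per weight.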
-
-// 3-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[2];
-    ggml_half d;           // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
-typedef struct {
-    uint8_t hmask[QK_K/8]; // quants - high bit
-    uint8_t qs[QK_K/4];    // quants - low 2 bits
-    uint8_t scales[12];    // scales, quantized with 6 bits
-    ggml_half d;           // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
-
-// 4-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d[2];     // super-block scales/mins
-    uint8_t scales[2];  // 4-bit block scales/mins
-    uint8_t qs[QK_K/2]; // 4-bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct {
-    union {
-        struct {
-            ggml_half d;    // super-block scale for quantized scales
-            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
-        ggml_half2 dm;
-    };
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];           // 4-bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-// 5-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
-    ggml_half d;             // super-block scale
-    int8_t  scales[QK_K/16]; // 8-bit block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct {
-    union {
-        struct {
-            ggml_half d;    // super-block scale for quantized scales
-            ggml_half dmin; // super-block scale for quantized mins
-        } GGML_COMMON_AGGR;
-        ggml_half2 dm;
-    };
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-// 6-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 6.5625 bits per weight
-typedef struct {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    ggml_half d;             // super-block scale
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
-
-// This is only used for intermediate quantization and dot products
-typedef struct {
-    float   d;              // delta
-    int8_t  qs[QK_K];       // quants
-    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
-} block_q8_K;
-static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
-
-// (Almost) "true" 2-bit quantization.
-// Due to the need to use blocks as per ggml design, it ends up using
-// 2.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
-    ggml_half d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
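Here the 2.0625 figure is 2 (the 16-bit scale d) + 64 (qs, 32 uint16 grid words) = 66 bytes per 256 weights: 66 * 8 / 256 = 2.0625 bits per weight.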
-
-// 2.3125 bpw quants
-typedef struct {
-    ggml_half d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
-
-// 2.5625 bpw quants
-typedef struct {
-    ggml_half d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t scales[QK_K/32];
-} block_iq2_s;
-static_assert(sizeof(block_iq2_s) == sizeof(ggml_half) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
-
-// (Almost) "true" 3-bit quantization.
-// Due to the need to use blocks as per ggml design, it ends up using
-// 3.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
-    ggml_half d;
-    uint8_t qs[3*QK_K/8];
-} block_iq3_xxs;
-static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
-
-// 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
-#define IQ3S_N_SCALE QK_K/64
-#endif
-typedef struct {
-    ggml_half d;
-    uint8_t qs[QK_K/4];
-    uint8_t qh[QK_K/32];
-    uint8_t signs[QK_K/8];
-    uint8_t scales[IQ3S_N_SCALE];
-} block_iq3_s;
-static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
-
-typedef struct {
-    ggml_half d;
-    uint8_t  qs[QK_K/8];
-    uint16_t qh[QK_K/32];
-} block_iq1_s;
-static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
-
-// 1.75 bpw
-typedef struct {
-    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
-    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
-#if QK_K == 64
-    ggml_half d;
-#endif
-    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
-} block_iq1_m;
-#if QK_K == 64
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
-#else
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
-#endif
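For the QK_K = 256 branch: 32 (qs) + 16 (qh) + 8 (scales) = 56 bytes per 256 weights, i.e. 56 * 8 / 256 = 1.75 bits per weight. There is no ggml_half field in that branch; the per-block fp16 scale is recoverable from bits of `scales`, reassembled via the iq1m_scale_t union below.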
-
-// Used by IQ1_M quants
-typedef union {
-    ggml_half f16;
-    uint16_t  u16;
-} iq1m_scale_t;
-
-// Non-linear quants
-#define QK4_NL 32
-typedef struct {
-    ggml_half d;
-    uint8_t qs[QK4_NL/2];
-} block_iq4_nl;
-static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
-
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
-typedef struct {
-    ggml_half d;
-    uint16_t scales_h;
-    uint8_t  scales_l[QK_K/64];
-    uint8_t  qs[QK_K/2];
-} block_iq4_xs;
-static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
-
-#endif // GGML_COMMON_DECL
-#endif // GGML_COMMON_DECL
-
-////////////////////////////////////////////////////////////////////////////////
-
-#ifndef GGML_COMMON_IMPL
-
-#if defined(GGML_COMMON_IMPL_C)
-#include <stdint.h>
-
-#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
-#define GGML_TABLE_END() };
-
-#define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_METAL)
-#include <metal_stdlib>
-
-#define GGML_TABLE_BEGIN(type, name, size) static const constant type name[size] = {
-#define GGML_TABLE_END() };
-
-#define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
-#include <cstdint>
-
-#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
-#define GGML_TABLE_END() };
-
-#define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_SYCL)
-
-#include <cstdint>
-
-#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
-#define GGML_TABLE_END() };
-
-#define GGML_COMMON_IMPL
-#endif
-
-#if defined(GGML_COMMON_IMPL)
-
-GGML_TABLE_BEGIN(uint8_t, kmask_iq2xs, 8)
-    1, 2, 4, 8, 16, 32, 64, 128
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
-      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
-    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
-    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
-     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
-    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
-     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
-     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
-    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-GGML_TABLE_END()
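The ksigns_iq2xs table above appears to be each 7-bit index with bit 7 set to the odd parity of its low bits, so every entry has even overall parity. A small self-contained sketch (editorial, not from the commit) that regenerates it:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        for (int i = 0; i < 128; i++) {
            uint8_t p = (uint8_t) i;
            p ^= p >> 4; p ^= p >> 2; p ^= p >> 1;   // parity of the 7 index bits
            uint8_t entry = (uint8_t) (i | ((p & 1) << 7));
            printf("%3u%s", entry, (i % 16 == 15) ? "\n" : ", ");
        }
        return 0;
    }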
-
-//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
-    0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
-    0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
-    0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
-    0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
-    0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
-    0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
-    0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
-    0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
-    0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
-    0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
-    0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
-    0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
-    0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
-    0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
-    0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
-    0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
-    0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
-    0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
-    0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
-    0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
-    0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
-    0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
-    0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
-    0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
-    0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
-    0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
-    0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
-    0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
-    0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
-    0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
-    0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
-    0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
-GGML_TABLE_END()
-//#endif
-
-
-GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
-    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
-    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
-    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
-    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
-    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
-    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
-    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
-    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
-    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
-    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
-    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
-    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
-    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
-    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
-    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
-    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
-    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
-    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
-    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
-    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
-    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
-    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
-    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
-    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
-    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
-    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
-    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
-    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
-    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
-    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
-    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
-    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
-    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
-    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
-    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
-    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
-    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
-    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
-    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
-    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
-    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
-    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
-    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
-    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
-    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
-    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
-    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
-    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
-    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
-    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
-    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
-    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
-    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
-    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
-    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
-    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
-    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
-    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint64_t, iq2xs_grid, 512)
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
-    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
-    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
-    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
-    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
-    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
-    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
-    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
-    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
-    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
-    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
-    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
-    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
-    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
-    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
-    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
-    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
-    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
-    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
-    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
-    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
-    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
-    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
-    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
-    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
-    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
-    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
-    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
-    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
-    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
-    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
-    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
-    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
-    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
-    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
-    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
-    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
-    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
-    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
-    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
-    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
-    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
-    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
-    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
-    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
-    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
-    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
-    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
-    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
-    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
-    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
-    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
-    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
-    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
-    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
-    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
-    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
-    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
-    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
-    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
-    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
-    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
-    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
-    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
-    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
-    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
-    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
-    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
-    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
-    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
-    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
-    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
-    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
-    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
-    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
-    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
-    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
-    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
-    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
-    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
-    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
-    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
-    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
-    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
-    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
-    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
-    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
-    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
-    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
-    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
-    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
-    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
-    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
-    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
-    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
-    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
-    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
-    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
-    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
-    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
-    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
-    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
-    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
-    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
-    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
-    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
-    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
-    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
-    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
-    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
-    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
-    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
-    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
-    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
-    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
-    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
-    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint64_t, iq2s_grid, 1024)
-    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
-    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
-    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
-    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
-    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
-    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
-    0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
-    0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
-    0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
-    0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
-    0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
-    0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
-    0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
-    0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
-    0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
-    0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
-    0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
-    0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
-    0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
-    0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
-    0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
-    0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
-    0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
-    0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
-    0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
-    0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
-    0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
-    0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
-    0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
-    0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
-    0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
-    0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
-    0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
-    0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
-    0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
-    0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
-    0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
-    0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
-    0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
-    0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
-    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
-    0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
-    0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
-    0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
-    0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
-    0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
-    0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
-    0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
-    0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
-    0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
-    0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
-    0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
-    0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
-    0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
-    0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
-    0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
-    0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
-    0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
-    0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
-    0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
-    0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
-    0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
-    0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
-    0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
-    0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
-    0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
-    0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
-    0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
-    0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
-    0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
-    0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
-    0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
-    0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
-    0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
-    0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
-    0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
-    0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
-    0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
-    0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
-    0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
-    0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
-    0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
-    0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
-    0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
-    0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
-    0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
-    0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
-    0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
-    0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
-    0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
-    0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
-    0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
-    0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
-    0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
-    0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
-    0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
-    0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
-    0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
-    0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
-    0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
-    0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
-    0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
-    0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
-    0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
-    0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
-    0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
-    0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
-    0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
-    0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
-    0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
-    0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
-    0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
-    0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
-    0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
-    0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
-    0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
-    0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
-    0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
-    0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
-    0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
-    0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
-    0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
-    0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
-    0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
-    0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
-    0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
-    0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
-    0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
-    0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
-    0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
-    0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
-    0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
-    0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
-    0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
-    0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
-    0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
-    0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
-    0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
-    0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
-    0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
-    0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
-    0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
-    0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
-    0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
-    0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
-    0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
-    0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
-    0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
-    0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
-    0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
-    0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
-    0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
-    0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
-    0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
-    0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
-    0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
-    0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
-    0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
-    0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
-    0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
-    0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
-    0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
-    0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
-    0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
-    0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
-    0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
-    0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
-    0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
-    0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
-    0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
-    0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
-    0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
-    0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
-    0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
-    0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
-    0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
-    0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
-    0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
-    0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
-    0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
-    0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
-    0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
-    0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
-    0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
-    0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
-    0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
-    0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
-    0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
-    0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
-    0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
-    0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
-    0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
-    0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
-    0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
-    0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
-    0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
-    0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
-    0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
-    0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
-    0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
-    0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
-    0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
-    0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
-    0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
-    0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
-    0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
-    0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
-    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
-    0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
-    0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
-    0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
-    0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
-    0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
-    0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
-    0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
-    0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
-    0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
-    0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
-    0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
-    0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
-    0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
-    0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
-    0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
-    0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
-    0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
-    0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
-    0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
-    0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
-    0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
-    0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
-    0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
-    0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
-    0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
-    0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
-    0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
-    0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
-    0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
-    0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
-    0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
-    0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
-    0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
-    0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
-    0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
-    0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
-    0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
-    0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
-    0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
-    0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
-    0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
-    0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
-    0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
-    0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
-    0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
-    0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
-    0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
-    0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint32_t, iq3xxs_grid, 256)
-    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
-    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
-    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
-    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
-    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
-    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
-    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
-    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
-    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
-    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
-    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
-    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
-    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
-    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
-    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
-    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
-    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
-    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
-    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
-    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
-    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
-    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
-    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
-    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
-    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
-    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
-    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
-    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
-    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
-    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
-    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
-    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
-    0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
-    0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
-    0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
-    0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
-    0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
-    0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
-    0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
-    0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
-    0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
-    0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
-    0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
-    0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
-    0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
-    0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
-    0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
-    0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
-    0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
-    0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
-    0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
-    0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
-    0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
-    0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
-    0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
-    0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
-    0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
-    0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
-    0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
-    0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
-    0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
-    0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
-    0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
-    0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
-    0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
-    0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
-    0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
-    0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
-    0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
-    0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
-    0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
-    0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
-    0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
-    0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
-    0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
-    0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
-    0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
-    0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
-    0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
-    0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
-    0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
-    0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
-    0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
-    0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
-    0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
-    0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
-    0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
-    0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
-    0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
-    0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
-    0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
-    0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
-    0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
-    0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
-    0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
-    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
-GGML_TABLE_END()
-
-#define NGRID_IQ1S 2048
-#define IQ1S_DELTA 0.125f
-#define IQ1M_DELTA 0.125f
-#if defined(GGML_COMMON_IMPL_C)
-GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
-    0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
-    0xffffffffffff0101, 0xffffffffff00ff00, 0xffffffffff000000, 0xffffffffff01ffff,
-    0xffffffffff01ff01, 0xffffffffff0101ff, 0xffffffffff010101, 0xffffffff00ff0000,
-    0xffffffff0000ff00, 0xffffffff000000ff, 0xffffffff00000001, 0xffffffff00010000,
-    0xffffffff01ffffff, 0xffffffff01ffff01, 0xffffffff01ff01ff, 0xffffffff01ff0101,
-    0xffffffff01000000, 0xffffffff0101ffff, 0xffffffff0101ff01, 0xffffffff010101ff,
-    0xffffffff01010101, 0xffffff00ffff00ff, 0xffffff00ffff0000, 0xffffff00ff00ff00,
-    0xffffff00ff0000ff, 0xffffff00ff000001, 0xffffff00ff000100, 0xffffff00ff000101,
-    0xffffff00ff010000, 0xffffff0000ffff00, 0xffffff0000ff0001, 0xffffff0000ff0100,
-    0xffffff000000ff01, 0xffffff0000000000, 0xffffff0000000101, 0xffffff000001ff00,
-    0xffffff00000100ff, 0xffffff0000010001, 0xffffff00000101ff, 0xffffff0001ff0000,
-    0xffffff000100ff00, 0xffffff00010000ff, 0xffffff0001000001, 0xffffff0001010000,
-    0xffffff01ffffffff, 0xffffff01ffffff01, 0xffffff01ffff01ff, 0xffffff01ffff0101,
-    0xffffff01ff000000, 0xffffff01ff01ffff, 0xffffff01ff01ff01, 0xffffff01ff0101ff,
-    0xffffff01ff010101, 0xffffff0100ff0000, 0xffffff010000ff00, 0xffffff0100000100,
-    0xffffff01000100ff, 0xffffff0100010100, 0xffffff0101ffffff, 0xffffff0101ffff01,
-    0xffffff0101ff01ff, 0xffffff0101ff0101, 0xffffff010100ff00, 0xffffff0101000000,
-    0xffffff0101000100, 0xffffff010101ffff, 0xffffff010101ff01, 0xffffff01010101ff,
-    0xffffff0101010101, 0xffff00ffff00ff00, 0xffff00ffff0000ff, 0xffff00ffff000001,
-    0xffff00ffff010000, 0xffff00ff00ffff00, 0xffff00ff00ff0100, 0xffff00ff00000000,
-    0xffff00ff00000101, 0xffff00ff000100ff, 0xffff00ff00010000, 0xffff00ff0100ff00,
-    0xffff00ff01000100, 0xffff00ff01010000, 0xffff0000ffffff00, 0xffff0000ffff00ff,
-    0xffff0000ffff0000, 0xffff0000ffff0001, 0xffff0000ff000000, 0xffff0000ff0001ff,
-    0xffff0000ff000101, 0xffff0000ff010100, 0xffff000000ffffff, 0xffff000000ff0000,
-    0xffff000000ff0101, 0xffff00000000ffff, 0xffff00000000ff00, 0xffff0000000000ff,
-    0xffff000000000000, 0xffff000000000001, 0xffff000000000100, 0xffff00000001ffff,
-    0xffff00000001ff01, 0xffff000000010000, 0xffff0000000101ff, 0xffff000000010101,
-    0xffff000001ffff00, 0xffff00000100ff00, 0xffff000001000000, 0xffff0000010001ff,
-    0xffff000001000101, 0xffff00000101ff00, 0xffff0000010100ff, 0xffff000001010000,
-    0xffff000001010001, 0xffff000001010100, 0xffff0001ff0000ff, 0xffff0001ff000100,
-    0xffff000100ffff00, 0xffff000100ff00ff, 0xffff00010000ffff, 0xffff00010000ff01,
-    0xffff000100000000, 0xffff0001000001ff, 0xffff00010001ffff, 0xffff00010001ff00,
-    0xffff000100010001, 0xffff000100010100, 0xffff000101ff0000, 0xffff00010100ff00,
-    0xffff0001010000ff, 0xffff000101000100, 0xffff01ffffffffff, 0xffff01ffffffff01,
-    0xffff01ffffff01ff, 0xffff01ffffff0101, 0xffff01ffff000000, 0xffff01ffff01ffff,
-    0xffff01ffff01ff01, 0xffff01ffff0101ff, 0xffff01ffff010101, 0xffff01ff00ff0000,
-    0xffff01ff0000ff00, 0xffff01ff00000001, 0xffff01ff00010000, 0xffff01ff01ffffff,
-    0xffff01ff01ffff01, 0xffff01ff01ff01ff, 0xffff01ff01ff0101, 0xffff01ff01000000,
-    0xffff01ff0101ffff, 0xffff01ff0101ff01, 0xffff01ff010101ff, 0xffff01ff01010101,
-    0xffff0100ffff0000, 0xffff0100ff00ff00, 0xffff0100ff0000ff, 0xffff0100ff000100,
-    0xffff0100ff0100ff, 0xffff0100ff010000, 0xffff010000ffff00, 0xffff01000000ffff,
-    0xffff01000000ff00, 0xffff010000000000, 0xffff01000001ff00, 0xffff0100000100ff,
-    0xffff010000010100, 0xffff01000100ff00, 0xffff0100010000ff, 0xffff010001000001,
-    0xffff010001000100, 0xffff010001010000, 0xffff0101ffffffff, 0xffff0101ffffff01,
-    0xffff0101ffff01ff, 0xffff0101ffff0101, 0xffff0101ff000000, 0xffff0101ff01ffff,
-    0xffff0101ff01ff01, 0xffff0101ff0101ff, 0xffff0101ff010101, 0xffff010100ff0000,
-    0xffff01010000ff00, 0xffff010100000100, 0xffff01010001ff00, 0xffff010100010000,
-    0xffff010101ffffff, 0xffff010101ffff01, 0xffff010101ff0000, 0xffff010101ff01ff,
-    0xffff010101ff0101, 0xffff010101000000, 0xffff01010101ffff, 0xffff01010101ff01,
-    0xffff0101010101ff, 0xffff010101010101, 0xff00ffffff00ffff, 0xff00ffffff00ff00,
-    0xff00ffffff0000ff, 0xff00ffffff000100, 0xff00ffffff0100ff, 0xff00ffffff010000,
-    0xff00ffff00ffff00, 0xff00ffff00ff00ff, 0xff00ffff0000ffff, 0xff00ffff00000000,
-    0xff00ffff000001ff, 0xff00ffff0001ff00, 0xff00ffff000100ff, 0xff00ffff00010000,
-    0xff00ffff00010100, 0xff00ffff0100ff00, 0xff00ffff010000ff, 0xff00ffff01000001,
-    0xff00ffff0101ff00, 0xff00ffff01010000, 0xff00ff00ffffff00, 0xff00ff00ffff00ff,
-    0xff00ff00ffff0001, 0xff00ff00ffff0100, 0xff00ff00ff00ffff, 0xff00ff00ff00ff01,
-    0xff00ff00ff000000, 0xff00ff00ff0001ff, 0xff00ff00ff01ff00, 0xff00ff00ff0100ff,
-    0xff00ff00ff010100, 0xff00ff0000ff0000, 0xff00ff0000ff0101, 0xff00ff000000ffff,
-    0xff00ff000000ff00, 0xff00ff000000ff01, 0xff00ff00000000ff, 0xff00ff0000000000,
-    0xff00ff0000000001, 0xff00ff0000000100, 0xff00ff000001ffff, 0xff00ff0000010000,
-    0xff00ff0001ff00ff, 0xff00ff000100ff01, 0xff00ff0001000000, 0xff00ff000101ff00,
-    0xff00ff00010100ff, 0xff00ff01ff00ff00, 0xff00ff01ff0000ff, 0xff00ff01ff000001,
-    0xff00ff01ff010000, 0xff00ff0100ffffff, 0xff00ff0100ff0001, 0xff00ff0100ff0100,
-    0xff00ff010000ff01, 0xff00ff0100000000, 0xff00ff01000001ff, 0xff00ff0100000101,
-    0xff00ff01000100ff, 0xff00ff0100010001, 0xff00ff0101ff0000, 0xff00ff010100ff00,
-    0xff00ff01010000ff, 0xff00ff0101000001, 0xff00ff0101010000, 0xff0000ffffffff00,
-    0xff0000ffffff0001, 0xff0000ffffff0100, 0xff0000ffff0000ff, 0xff0000ffff000000,
-    0xff0000ffff0001ff, 0xff0000ffff000100, 0xff0000ffff01ff00, 0xff0000ffff010001,
-    0xff0000ff00ffff00, 0xff0000ff00ff0000, 0xff0000ff00ff0001, 0xff0000ff00ff01ff,
-    0xff0000ff00ff0101, 0xff0000ff0000ff00, 0xff0000ff000000ff, 0xff0000ff00000000,
-    0xff0000ff00000001, 0xff0000ff00000100, 0xff0000ff0001ff01, 0xff0000ff00010000,
-    0xff0000ff000101ff, 0xff0000ff01ff00ff, 0xff0000ff01ff0100, 0xff0000ff0100ffff,
-    0xff0000ff010000ff, 0xff0000ff01000000, 0xff0000ff010001ff, 0xff0000ff01000100,
-    0xff0000ff01000101, 0xff0000ff0101ff00, 0xff0000ff010100ff, 0xff0000ff01010000,
-    0xff0000ff01010100, 0xff000000ffffff01, 0xff000000ffff0000, 0xff000000ffff0101,
-    0xff000000ff00ff00, 0xff000000ff0000ff, 0xff000000ff000000, 0xff000000ff000001,
-    0xff000000ff000100, 0xff000000ff01ffff, 0xff000000ff01ff01, 0xff000000ff010000,
-    0xff000000ff0101ff, 0xff000000ff010101, 0xff00000000ffff00, 0xff00000000ff00ff,
-    0xff00000000ff0000, 0xff00000000ff0001, 0xff0000000000ff00, 0xff0000000000ff01,
-    0xff000000000000ff, 0xff00000000000000, 0xff00000000000001, 0xff00000000000100,
-    0xff00000000000101, 0xff0000000001ff00, 0xff000000000100ff, 0xff00000000010000,
-    0xff00000000010001, 0xff00000000010100, 0xff00000001ffffff, 0xff00000001ffff01,
-    0xff00000001ff00ff, 0xff00000001ff0000, 0xff00000001ff01ff, 0xff00000001ff0101,
-    0xff0000000100ffff, 0xff0000000100ff00, 0xff000000010000ff, 0xff00000001000000,
-    0xff00000001000001, 0xff00000001000100, 0xff00000001000101, 0xff0000000101ffff,
-    0xff0000000101ff01, 0xff00000001010000, 0xff000001ffffff00, 0xff000001ffff00ff,
-    0xff000001ffff0000, 0xff000001ffff0001, 0xff000001ff000000, 0xff000001ff000001,
-    0xff000001ff0001ff, 0xff000001ff000101, 0xff000001ff01ff00, 0xff000001ff010001,
-    0xff00000100ffffff, 0xff00000100ffff01, 0xff00000100ff00ff, 0xff00000100ff0000,
-    0xff00000100ff01ff, 0xff00000100ff0101, 0xff0000010000ff00, 0xff00000100000000,
-    0xff00000100000001, 0xff000001000001ff, 0xff00000100000100, 0xff0000010001ff00,
-    0xff000001000100ff, 0xff00000100010000, 0xff000001000101ff, 0xff00000100010100,
-    0xff00000100010101, 0xff00000101ff0001, 0xff00000101ff0101, 0xff0000010100ff01,
-    0xff00000101000000, 0xff000001010100ff, 0xff00000101010100, 0xff0001ffff00ff00,
-    0xff0001ffff000001, 0xff0001ffff010000, 0xff0001ff00ffff00, 0xff0001ff00ff00ff,
-    0xff0001ff00ff0001, 0xff0001ff00ff0100, 0xff0001ff0000ffff, 0xff0001ff00000000,
-    0xff0001ff000001ff, 0xff0001ff00000101, 0xff0001ff0001ffff, 0xff0001ff0001ff00,
-    0xff0001ff000100ff, 0xff0001ff00010001, 0xff0001ff00010100, 0xff0001ff01ff0000,
-    0xff0001ff0100ff00, 0xff0001ff010000ff, 0xff0001ff01010000, 0xff000100ff00ffff,
-    0xff000100ff00ff01, 0xff000100ff000000, 0xff000100ff000101, 0xff000100ff01ff00,
-    0xff000100ff010000, 0xff00010000ffff01, 0xff00010000ff00ff, 0xff00010000ff0000,
-    0xff00010000ff01ff, 0xff0001000000ff00, 0xff000100000000ff, 0xff00010000000000,
-    0xff00010000000001, 0xff00010000000100, 0xff00010000000101, 0xff0001000001ffff,
-    0xff00010000010000, 0xff00010000010101, 0xff00010001ff0100, 0xff0001000100ff00,
-    0xff0001000100ff01, 0xff00010001000000, 0xff000100010001ff, 0xff0001000101ff00,
-    0xff00010001010001, 0xff00010001010100, 0xff000101ffff0100, 0xff000101ff000001,
-    0xff000101ff0100ff, 0xff000101ff010001, 0xff00010100ff00ff, 0xff00010100ff0001,
-    0xff00010100ff0100, 0xff0001010000ffff, 0xff0001010000ff01, 0xff00010100000000,
-    0xff000101000001ff, 0xff0001010001ff00, 0xff00010100010001, 0xff00010100010100,
-    0xff00010101ff0000, 0xff0001010100ff00, 0xff00010101000001, 0xff00010101000101,
-    0xff01ffffffffffff, 0xff01ffffffffff01, 0xff01ffffffff01ff, 0xff01ffffffff0101,
-    0xff01ffffff000000, 0xff01ffffff01ffff, 0xff01ffffff01ff01, 0xff01ffffff010000,
-    0xff01ffffff0101ff, 0xff01ffffff010101, 0xff01ffff00ff0000, 0xff01ffff0000ff00,
-    0xff01ffff00000100, 0xff01ffff0001ff00, 0xff01ffff00010000, 0xff01ffff01ffffff,
-    0xff01ffff01ffff01, 0xff01ffff01ff01ff, 0xff01ffff01ff0101, 0xff01ffff01000000,
-    0xff01ffff0101ffff, 0xff01ffff0101ff01, 0xff01ffff01010000, 0xff01ffff010101ff,
-    0xff01ffff01010101, 0xff01ff00ffff0000, 0xff01ff00ff00ff00, 0xff01ff00ff0000ff,
-    0xff01ff00ff000100, 0xff01ff00ff010000, 0xff01ff0000ffff01, 0xff01ff0000ff00ff,
-    0xff01ff0000ff0100, 0xff01ff0000000000, 0xff01ff00000001ff, 0xff01ff0000000101,
-    0xff01ff000001ff00, 0xff01ff00000100ff, 0xff01ff0000010000, 0xff01ff0000010001,
-    0xff01ff0001ff0000, 0xff01ff000100ffff, 0xff01ff0001000001, 0xff01ff0001000100,
-    0xff01ff0001010000, 0xff01ff01ffffff00, 0xff01ff01ffff01ff, 0xff01ff01ffff0101,
-    0xff01ff01ff00ff00, 0xff01ff01ff000000, 0xff01ff01ff01ffff, 0xff01ff01ff01ff01,
-    0xff01ff01ff0101ff, 0xff01ff01ff010101, 0xff01ff0100ff0000, 0xff01ff010000ff00,
-    0xff01ff0100000001, 0xff01ff0100000100, 0xff01ff0100010000, 0xff01ff0101ffff00,
-    0xff01ff0101ff01ff, 0xff01ff0101ff0101, 0xff01ff010100ff00, 0xff01ff0101000000,
-    0xff01ff010101ffff, 0xff01ff010101ff01, 0xff01ff01010101ff, 0xff01ff0101010101,
-    0xff0100ffffff0000, 0xff0100ffff0000ff, 0xff0100ffff000001, 0xff0100ffff000100,
-    0xff0100ffff010000, 0xff0100ff00ff00ff, 0xff0100ff00ff0000, 0xff0100ff00ff0001,
-    0xff0100ff00ff0100, 0xff0100ff0000ff01, 0xff0100ff00000000, 0xff0100ff000001ff,
-    0xff0100ff00000101, 0xff0100ff00010001, 0xff0100ff01ff0000, 0xff0100ff0100ff00,
-    0xff0100ff010000ff, 0xff0100ff01000100, 0xff0100ff0101ff00, 0xff0100ff01010000,
-    0xff010000ffff0100, 0xff010000ff000000, 0xff010000ff01ff00, 0xff010000ff010100,
-    0xff01000000ffffff, 0xff01000000ff0000, 0xff01000000ff01ff, 0xff0100000000ff00,
-    0xff010000000000ff, 0xff01000000000000, 0xff01000000000100, 0xff0100000001ff01,
-    0xff01000000010000, 0xff010000000101ff, 0xff01000001ff0100, 0xff0100000100ffff,
-    0xff010000010000ff, 0xff01000001000000, 0xff010000010001ff, 0xff01000001000101,
-    0xff0100000101ff00, 0xff010000010100ff, 0xff01000001010001, 0xff01000001010100,
-    0xff010001ffff0000, 0xff010001ff00ffff, 0xff010001ff00ff01, 0xff010001ff000100,
-    0xff010001ff010000, 0xff01000100ffff00, 0xff01000100ff0100, 0xff01000100000000,
-    0xff0100010001ffff, 0xff0100010001ff00, 0xff01000100010100, 0xff01000101ff00ff,
-    0xff01000101ff0001, 0xff0100010100ffff, 0xff01000101000101, 0xff0101ffffffffff,
-    0xff0101ffffffff01, 0xff0101ffffff01ff, 0xff0101ffffff0101, 0xff0101ffff000000,
-    0xff0101ffff01ffff, 0xff0101ffff01ff01, 0xff0101ffff0101ff, 0xff0101ffff010101,
-    0xff0101ff00ff0000, 0xff0101ff0000ff00, 0xff0101ff000000ff, 0xff0101ff00010000,
-    0xff0101ff01ffffff, 0xff0101ff01ffff01, 0xff0101ff01ff01ff, 0xff0101ff01ff0101,
-    0xff0101ff0101ffff, 0xff0101ff0101ff01, 0xff0101ff010101ff, 0xff0101ff01010101,
-    0xff010100ffff0100, 0xff010100ff00ff00, 0xff010100ff0000ff, 0xff010100ff000100,
-    0xff010100ff010000, 0xff01010000ff0001, 0xff01010000ff0100, 0xff0101000000ff01,
-    0xff01010000000000, 0xff0101000001ff00, 0xff010100000100ff, 0xff01010000010001,
-    0xff01010000010100, 0xff01010001ff0000, 0xff0101000100ffff, 0xff01010001000001,
-    0xff01010001000100, 0xff010100010100ff, 0xff01010001010000, 0xff010101ffffffff,
-    0xff010101ffffff01, 0xff010101ffff01ff, 0xff010101ffff0101, 0xff010101ff01ffff,
-    0xff010101ff01ff01, 0xff010101ff0101ff, 0xff010101ff010101, 0xff01010100ff0000,
-    0xff0101010000ff00, 0xff01010100000001, 0xff01010100000100, 0xff01010100010000,
-    0xff01010101ffffff, 0xff01010101ffff01, 0xff01010101ff01ff, 0xff01010101ff0101,
-    0xff01010101000000, 0xff0101010101ffff, 0xff0101010101ff01, 0xff010101010101ff,
-    0xff01010101010101, 0x00ffffffffff0000, 0x00ffffffff00ff00, 0x00ffffffff000001,
-    0x00ffffffff010000, 0x00ffffff00ff0100, 0x00ffffff0000ff01, 0x00ffffff00000000,
-    0x00ffffff000001ff, 0x00ffffff00000101, 0x00ffffff0001ff00, 0x00ffffff000100ff,
-    0x00ffffff00010001, 0x00ffffff010000ff, 0x00ffffff01000100, 0x00ffffff0101ff00,
-    0x00ffffff01010001, 0x00ffff00ffffffff, 0x00ffff00ffffff00, 0x00ffff00ffff00ff,
-    0x00ffff00ffff0001, 0x00ffff00ffff0100, 0x00ffff00ff00ff01, 0x00ffff00ff000000,
-    0x00ffff00ff000001, 0x00ffff00ff0001ff, 0x00ffff00ff000101, 0x00ffff00ff01ff00,
-    0x00ffff00ff010001, 0x00ffff00ff010100, 0x00ffff0000ff0000, 0x00ffff0000ff01ff,
-    0x00ffff0000ff0101, 0x00ffff000000ff00, 0x00ffff00000000ff, 0x00ffff0000000000,
-    0x00ffff0000000001, 0x00ffff0000000100, 0x00ffff0000000101, 0x00ffff0000010000,
-    0x00ffff00000101ff, 0x00ffff0000010101, 0x00ffff0001ffff00, 0x00ffff0001ff00ff,
-    0x00ffff0001ff0001, 0x00ffff000100ffff, 0x00ffff000100ff01, 0x00ffff0001000000,
-    0x00ffff000101ffff, 0x00ffff000101ff00, 0x00ffff000101ff01, 0x00ffff01ffff0000,
-    0x00ffff01ff00ff00, 0x00ffff01ff0000ff, 0x00ffff01ff000001, 0x00ffff01ff010000,
-    0x00ffff0100ffff00, 0x00ffff010000ff01, 0x00ffff0100000000, 0x00ffff0100000101,
-    0x00ffff01000100ff, 0x00ffff0100010100, 0x00ffff0101ff0100, 0x00ffff01010000ff,
-    0x00ffff0101010000, 0x00ff00ffffffff00, 0x00ff00ffff000000, 0x00ff00ffff000100,
-    0x00ff00ffff010100, 0x00ff00ff00ff0000, 0x00ff00ff00ff01ff, 0x00ff00ff00ff0101,
-    0x00ff00ff0000ff00, 0x00ff00ff000000ff, 0x00ff00ff00000000, 0x00ff00ff00000001,
-    0x00ff00ff0001ff00, 0x00ff00ff0001ff01, 0x00ff00ff00010000, 0x00ff00ff000101ff,
-    0x00ff00ff00010101, 0x00ff00ff01ffff00, 0x00ff00ff01ff0001, 0x00ff00ff01ff0100,
-    0x00ff00ff0100ffff, 0x00ff00ff0100ff01, 0x00ff00ff01000000, 0x00ff00ff0101ffff,
-    0x00ff00ff0101ff00, 0x00ff00ff01010100, 0x00ff0000ffffff00, 0x00ff0000ffffff01,
-    0x00ff0000ffff0000, 0x00ff0000ffff0101, 0x00ff0000ff00ff00, 0x00ff0000ff0000ff,
-    0x00ff0000ff000000, 0x00ff0000ff000001, 0x00ff0000ff000100, 0x00ff0000ff01ffff,
-    0x00ff0000ff010000, 0x00ff0000ff010101, 0x00ff000000ffff00, 0x00ff000000ff00ff,
-    0x00ff000000ff0000, 0x00ff000000ff0001, 0x00ff000000ff0100, 0x00ff00000000ffff,
-    0x00ff00000000ff00, 0x00ff0000000000ff, 0x00ff000000000000, 0x00ff000000000001,
-    0x00ff0000000001ff, 0x00ff000000000100, 0x00ff00000001ff00, 0x00ff0000000100ff,
-    0x00ff000000010000, 0x00ff000000010001, 0x00ff000000010100, 0x00ff000001ffff01,
-    0x00ff000001ff00ff, 0x00ff000001ff0000, 0x00ff000001ff01ff, 0x00ff00000100ff00,
-    0x00ff0000010000ff, 0x00ff000001000000, 0x00ff000001000001, 0x00ff000001000100,
-    0x00ff000001000101, 0x00ff000001010000, 0x00ff0000010101ff, 0x00ff000001010101,
-    0x00ff0001ffffff00, 0x00ff0001ffff0000, 0x00ff0001ffff0100, 0x00ff0001ff0000ff,
-    0x00ff0001ff000000, 0x00ff0001ff0001ff, 0x00ff0001ff000101, 0x00ff0001ff01ff00,
-    0x00ff0001ff0100ff, 0x00ff0001ff010100, 0x00ff000100ffffff, 0x00ff000100ffff01,
-    0x00ff000100ff0000, 0x00ff000100ff01ff, 0x00ff00010000ffff, 0x00ff00010000ff00,
-    0x00ff00010000ff01, 0x00ff000100000000, 0x00ff000100000001, 0x00ff000100000100,
-    0x00ff00010001ff01, 0x00ff000100010000, 0x00ff0001000101ff, 0x00ff000101ffff00,
-    0x00ff000101ff0000, 0x00ff000101ff0101, 0x00ff0001010000ff, 0x00ff000101000000,
-    0x00ff00010101ff00, 0x00ff0001010100ff, 0x00ff000101010001, 0x00ff01ffffff0000,
-    0x00ff01ffff00ff00, 0x00ff01ffff000000, 0x00ff01ffff000101, 0x00ff01ffff010000,
-    0x00ff01ff00ffff01, 0x00ff01ff00ff0100, 0x00ff01ff0000ffff, 0x00ff01ff00000000,
-    0x00ff01ff000001ff, 0x00ff01ff0001ff00, 0x00ff01ff000100ff, 0x00ff01ff00010001,
-    0x00ff01ff00010100, 0x00ff01ff01ff0000, 0x00ff01ff0100ff00, 0x00ff01ff010000ff,
-    0x00ff01ff01000001, 0x00ff01ff01000100, 0x00ff01ff01010000, 0x00ff0100ffffff00,
-    0x00ff0100ffff0000, 0x00ff0100ffff0001, 0x00ff0100ffff0101, 0x00ff0100ff00ffff,
-    0x00ff0100ff0000ff, 0x00ff0100ff000000, 0x00ff0100ff0001ff, 0x00ff0100ff01ff00,
-    0x00ff0100ff0100ff, 0x00ff0100ff010001, 0x00ff010000ffffff, 0x00ff010000ff0000,
-    0x00ff010000ff0101, 0x00ff01000000ff00, 0x00ff01000000ff01, 0x00ff0100000000ff,
-    0x00ff010000000000, 0x00ff010000000001, 0x00ff010000000100, 0x00ff01000001ffff,
-    0x00ff01000001ff01, 0x00ff010000010000, 0x00ff010000010001, 0x00ff010000010101,
-    0x00ff010001ff0001, 0x00ff010001ff0100, 0x00ff01000100ff01, 0x00ff010001000000,
-    0x00ff010001000001, 0x00ff0100010001ff, 0x00ff01000101ff00, 0x00ff0100010100ff,
-    0x00ff010001010001, 0x00ff010001010100, 0x00ff0101ff000001, 0x00ff010100ff00ff,
-    0x00ff010100ff0001, 0x00ff010100ff0100, 0x00ff010100000000, 0x00ff0101000001ff,
-    0x00ff010100000101, 0x00ff0101000100ff, 0x00ff010100010100, 0x00ff0101010000ff,
-    0x00ff010101010000, 0x0000ffffffffff00, 0x0000ffffffff00ff, 0x0000ffffffff0000,
-    0x0000ffffffff0001, 0x0000ffffffff0100, 0x0000ffffff00ff01, 0x0000ffffff000000,
-    0x0000ffffff000101, 0x0000ffffff01ff00, 0x0000ffffff0100ff, 0x0000ffffff010100,
-    0x0000ffff00ffffff, 0x0000ffff00ff0000, 0x0000ffff00ff01ff, 0x0000ffff0000ff00,
-    0x0000ffff000000ff, 0x0000ffff00000000, 0x0000ffff00000001, 0x0000ffff00000100,
-    0x0000ffff00010000, 0x0000ffff000101ff, 0x0000ffff01ff0001, 0x0000ffff01ff0100,
-    0x0000ffff01000000, 0x0000ffff010001ff, 0x0000ffff0101ffff, 0x0000ffff0101ff00,
-    0x0000ffff01010001, 0x0000ffff01010100, 0x0000ff00ffff0000, 0x0000ff00ffff01ff,
-    0x0000ff00ffff0100, 0x0000ff00ffff0101, 0x0000ff00ff00ff00, 0x0000ff00ff0000ff,
-    0x0000ff00ff000000, 0x0000ff00ff000001, 0x0000ff00ff0001ff, 0x0000ff00ff000100,
-    0x0000ff00ff01ffff, 0x0000ff00ff010000, 0x0000ff00ff010001, 0x0000ff00ff0101ff,
-    0x0000ff00ff010101, 0x0000ff0000ffff00, 0x0000ff0000ff00ff, 0x0000ff0000ff0000,
-    0x0000ff0000ff0001, 0x0000ff0000ff0100, 0x0000ff000000ffff, 0x0000ff000000ff00,
-    0x0000ff000000ff01, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
-    0x0000ff00000001ff, 0x0000ff0000000100, 0x0000ff0000000101, 0x0000ff000001ff00,
-    0x0000ff00000100ff, 0x0000ff0000010000, 0x0000ff0000010001, 0x0000ff0000010100,
-    0x0000ff0001ffff01, 0x0000ff0001ff0000, 0x0000ff000100ff00, 0x0000ff00010000ff,
-    0x0000ff0001000000, 0x0000ff0001000001, 0x0000ff0001000100, 0x0000ff000101ffff,
-    0x0000ff0001010000, 0x0000ff0001010101, 0x0000ff01ffffff00, 0x0000ff01ffff0001,
-    0x0000ff01ff00ff01, 0x0000ff01ff000000, 0x0000ff01ff000101, 0x0000ff01ff01ff00,
-    0x0000ff01ff0100ff, 0x0000ff0100ffff01, 0x0000ff0100ff0000, 0x0000ff0100ff0101,
-    0x0000ff010000ff00, 0x0000ff01000000ff, 0x0000ff0100000000, 0x0000ff0100000001,
-    0x0000ff0100000100, 0x0000ff010001ff01, 0x0000ff0100010000, 0x0000ff0101ff0000,
-    0x0000ff010100ffff, 0x0000ff010100ff01, 0x0000ff0101000000, 0x0000ff0101000100,
-    0x0000ff0101000101, 0x0000ff01010100ff, 0x000000ffffff00ff, 0x000000ffffff0000,
-    0x000000ffff00ff00, 0x000000ffff0000ff, 0x000000ffff000000, 0x000000ffff000001,
-    0x000000ffff0001ff, 0x000000ffff000100, 0x000000ffff01ff00, 0x000000ffff010000,
-    0x000000ffff0101ff, 0x000000ffff010101, 0x000000ff00ffff00, 0x000000ff00ff00ff,
-    0x000000ff00ff0000, 0x000000ff00ff0001, 0x000000ff00ff0100, 0x000000ff00ff0101,
-    0x000000ff0000ffff, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
-    0x000000ff00000001, 0x000000ff000001ff, 0x000000ff00000100, 0x000000ff00000101,
-    0x000000ff0001ff00, 0x000000ff0001ff01, 0x000000ff000100ff, 0x000000ff00010000,
-    0x000000ff00010001, 0x000000ff00010100, 0x000000ff01ffffff, 0x000000ff01ff01ff,
-    0x000000ff01ff0101, 0x000000ff0100ff00, 0x000000ff010000ff, 0x000000ff01000000,
-    0x000000ff01000001, 0x000000ff01000100, 0x000000ff0101ff00, 0x000000ff010100ff,
-    0x000000ff01010000, 0x000000ff01010101, 0x00000000ffffff00, 0x00000000ffffff01,
-    0x00000000ffff00ff, 0x00000000ffff0000, 0x00000000ffff0001, 0x00000000ffff0100,
-    0x00000000ff00ffff, 0x00000000ff00ff00, 0x00000000ff00ff01, 0x00000000ff0000ff,
-    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff000101,
-    0x00000000ff01ff00, 0x00000000ff0100ff, 0x00000000ff010000, 0x00000000ff010001,
-    0x00000000ff010100, 0x0000000000ffffff, 0x0000000000ffff00, 0x0000000000ffff01,
-    0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, 0x0000000000ff01ff,
-    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
-    0x00000000000000ff, 0x0000000000000000, 0x0000000000000001, 0x00000000000001ff,
-    0x0000000000000100, 0x0000000000000101, 0x000000000001ffff, 0x000000000001ff00,
-    0x00000000000100ff, 0x0000000000010000, 0x0000000000010001, 0x00000000000101ff,
-    0x0000000000010100, 0x0000000000010101, 0x0000000001ffff00, 0x0000000001ff00ff,
-    0x0000000001ff0000, 0x0000000001ff0100, 0x0000000001ff0101, 0x000000000100ffff,
-    0x000000000100ff00, 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001,
-    0x00000000010001ff, 0x0000000001000100, 0x000000000101ff00, 0x00000000010100ff,
-    0x0000000001010000, 0x0000000001010001, 0x0000000001010100, 0x00000001ffffffff,
-    0x00000001ffffff00, 0x00000001ffffff01, 0x00000001ffff00ff, 0x00000001ffff0001,
-    0x00000001ffff01ff, 0x00000001ffff0100, 0x00000001ff00ff00, 0x00000001ff0000ff,
-    0x00000001ff000000, 0x00000001ff0001ff, 0x00000001ff000100, 0x00000001ff01ffff,
-    0x00000001ff01ff00, 0x00000001ff01ff01, 0x00000001ff0100ff, 0x00000001ff010000,
-    0x00000001ff010001, 0x00000001ff0101ff, 0x00000001ff010100, 0x0000000100ffff00,
-    0x0000000100ff0000, 0x0000000100ff0001, 0x0000000100ff01ff, 0x0000000100ff0100,
-    0x0000000100ff0101, 0x000000010000ffff, 0x000000010000ff00, 0x000000010000ff01,
-    0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, 0x00000001000001ff,
-    0x0000000100000100, 0x0000000100000101, 0x000000010001ff00, 0x00000001000100ff,
-    0x0000000100010000, 0x0000000100010100, 0x0000000101ffff01, 0x0000000101ff0000,
-    0x0000000101ff0001, 0x0000000101ff01ff, 0x0000000101ff0100, 0x0000000101ff0101,
-    0x000000010100ff00, 0x0000000101000000, 0x0000000101000101, 0x000000010101ff01,
-    0x0000000101010000, 0x0000000101010001, 0x00000001010101ff, 0x0000000101010100,
-    0x000001ffffff00ff, 0x000001ffffff0000, 0x000001ffffff0001, 0x000001ffffff0100,
-    0x000001ffff00ffff, 0x000001ffff000000, 0x000001ffff0001ff, 0x000001ffff01ff00,
-    0x000001ffff010101, 0x000001ff00ff0000, 0x000001ff00ff01ff, 0x000001ff00ff0101,
-    0x000001ff0000ff00, 0x000001ff000000ff, 0x000001ff00000000, 0x000001ff00000001,
-    0x000001ff000001ff, 0x000001ff00000100, 0x000001ff0001ffff, 0x000001ff0001ff01,
-    0x000001ff000100ff, 0x000001ff00010000, 0x000001ff01ffff01, 0x000001ff01ff0100,
-    0x000001ff0100ffff, 0x000001ff0100ff01, 0x000001ff01000000, 0x000001ff010001ff,
-    0x000001ff0101ff00, 0x000001ff01010100, 0x00000100ffffff00, 0x00000100ffffff01,
-    0x00000100ffff0000, 0x00000100ffff0101, 0x00000100ff00ff00, 0x00000100ff0000ff,
-    0x00000100ff000000, 0x00000100ff000001, 0x00000100ff000100, 0x00000100ff010000,
-    0x0000010000ffff00, 0x0000010000ff00ff, 0x0000010000ff0000, 0x0000010000ff0001,
-    0x0000010000ff0100, 0x000001000000ffff, 0x000001000000ff00, 0x000001000000ff01,
-    0x00000100000000ff, 0x0000010000000000, 0x0000010000000001, 0x00000100000001ff,
-    0x0000010000000100, 0x0000010000000101, 0x000001000001ff00, 0x00000100000100ff,
-    0x0000010000010000, 0x0000010000010001, 0x0000010000010100, 0x0000010001ffff00,
-    0x0000010001ff0000, 0x0000010001ff0100, 0x000001000100ff00, 0x00000100010000ff,
-    0x0000010001000000, 0x0000010001000001, 0x00000100010001ff, 0x0000010001000100,
-    0x0000010001010000, 0x00000101ffff00ff, 0x00000101ffff01ff, 0x00000101ff000000,
-    0x00000101ff000101, 0x00000101ff01ffff, 0x00000101ff010000, 0x00000101ff010001,
-    0x00000101ff010100, 0x0000010100ff0000, 0x0000010100ff01ff, 0x0000010100ff0100,
-    0x000001010000ff00, 0x0000010100000000, 0x0000010100000001, 0x00000101000001ff,
-    0x0000010100000100, 0x000001010001ff01, 0x0000010100010000, 0x00000101000101ff,
-    0x0000010100010101, 0x0000010101ffff00, 0x0000010101ff0101, 0x000001010100ff01,
-    0x0000010101000000, 0x0000010101000001, 0x00000101010001ff, 0x0000010101000101,
-    0x000001010101ff00, 0x0001ffffffff0000, 0x0001ffffff0000ff, 0x0001ffffff000001,
-    0x0001ffffff000100, 0x0001ffffff010000, 0x0001ffff00ff00ff, 0x0001ffff0000ffff,
-    0x0001ffff00000000, 0x0001ffff00000001, 0x0001ffff000001ff, 0x0001ffff00000101,
-    0x0001ffff0001ff00, 0x0001ffff000100ff, 0x0001ffff00010001, 0x0001ffff00010100,
-    0x0001ffff01ffff00, 0x0001ffff01000001, 0x0001ffff01010000, 0x0001ff00ffffff00,
-    0x0001ff00ffff00ff, 0x0001ff00ffff0001, 0x0001ff00ffff0100, 0x0001ff00ff00ff01,
-    0x0001ff00ff000000, 0x0001ff00ff01ff00, 0x0001ff00ff01ff01, 0x0001ff00ff010001,
-    0x0001ff00ff010100, 0x0001ff0000ff0000, 0x0001ff0000ff0100, 0x0001ff000000ff00,
-    0x0001ff0000000000, 0x0001ff0000000001, 0x0001ff0000000100, 0x0001ff0000010000,
-    0x0001ff0000010001, 0x0001ff0000010101, 0x0001ff0001ff00ff, 0x0001ff0001ff0101,
-    0x0001ff000100ff01, 0x0001ff0001000000, 0x0001ff000101ff00, 0x0001ff0001010001,
-    0x0001ff0001010100, 0x0001ff01ff00ff00, 0x0001ff01ff000001, 0x0001ff01ff000100,
-    0x0001ff0100ffffff, 0x0001ff0100ffff00, 0x0001ff0100ff0001, 0x0001ff0100000000,
-    0x0001ff0100000001, 0x0001ff01000001ff, 0x0001ff010001ffff, 0x0001ff0101ff0000,
-    0x0001ff010100ff00, 0x0001ff0101000001, 0x0001ff0101010000, 0x000100ffff00ff00,
-    0x000100ffff00ff01, 0x000100ffff000000, 0x000100ffff000001, 0x000100ffff000101,
-    0x000100ffff01ff00, 0x000100ffff010001, 0x000100ffff010100, 0x000100ff00ffffff,
-    0x000100ff00ffff01, 0x000100ff00ff0000, 0x000100ff00ff01ff, 0x000100ff00ff0101,
-    0x000100ff0000ff00, 0x000100ff000000ff, 0x000100ff00000000, 0x000100ff00000001,
-    0x000100ff00000100, 0x000100ff00000101, 0x000100ff0001ffff, 0x000100ff0001ff01,
-    0x000100ff00010000, 0x000100ff01ff00ff, 0x000100ff01ff0000, 0x000100ff01ff0100,
-    0x000100ff0100ffff, 0x000100ff0100ff01, 0x000100ff010000ff, 0x000100ff01000000,
-    0x000100ff01000001, 0x000100ff010001ff, 0x000100ff01000101, 0x000100ff0101ff00,
-    0x000100ff010100ff, 0x000100ff01010100, 0x00010000ffff0000, 0x00010000ffff01ff,
-    0x00010000ffff0101, 0x00010000ff00ff00, 0x00010000ff000000, 0x00010000ff000001,
-    0x00010000ff000100, 0x0001000000ff00ff, 0x0001000000ff0000, 0x0001000000ff0001,
-    0x0001000000ff0100, 0x000100000000ffff, 0x000100000000ff00, 0x00010000000000ff,
-    0x0001000000000000, 0x0001000000000001, 0x0001000000000100, 0x000100000001ff00,
-    0x00010000000100ff, 0x0001000000010000, 0x0001000000010001, 0x0001000000010100,
-    0x0001000001ff0001, 0x0001000001ff0100, 0x0001000001ff0101, 0x000100000100ff00,
-    0x0001000001000000, 0x0001000001000001, 0x0001000001000100, 0x0001000001000101,
-    0x000100000101ff01, 0x0001000001010000, 0x0001000001010001, 0x00010000010101ff,
-    0x00010001ffffff01, 0x00010001ffff0100, 0x00010001ff000000, 0x00010001ff01ffff,
-    0x00010001ff010001, 0x00010001ff0101ff, 0x00010001ff010100, 0x0001000100ffffff,
-    0x0001000100ff0000, 0x0001000100ff01ff, 0x0001000100ff0101, 0x000100010000ff00,
-    0x00010001000000ff, 0x0001000100000000, 0x0001000100000001, 0x00010001000001ff,
-    0x0001000100000101, 0x000100010001ffff, 0x0001000100010000, 0x00010001000101ff,
-    0x0001000101ffffff, 0x0001000101ffff01, 0x0001000101ff0000, 0x0001000101ff0101,
-    0x00010001010000ff, 0x0001000101000001, 0x00010001010001ff, 0x0001000101000100,
-    0x000100010101ffff, 0x00010001010100ff, 0x0001000101010001, 0x0001000101010101,
-    0x000101ffff000001, 0x000101ffff000100, 0x000101ffff010000, 0x000101ff00ffff00,
-    0x000101ff0000ff01, 0x000101ff00000000, 0x000101ff00000101, 0x000101ff0001ff00,
-    0x000101ff00010100, 0x000101ff01ff0000, 0x000101ff0100ff00, 0x000101ff010001ff,
-    0x000101ff01010001, 0x00010100ffffff00, 0x00010100ffff00ff, 0x00010100ff00ffff,
-    0x00010100ff000000, 0x00010100ff01ff00, 0x00010100ff0100ff, 0x00010100ff010001,
-    0x00010100ff010100, 0x0001010000ffffff, 0x0001010000ffff00, 0x0001010000ff0000,
-    0x0001010000ff0001, 0x0001010000ff01ff, 0x000101000000ff00, 0x00010100000000ff,
-    0x0001010000000000, 0x0001010000000001, 0x0001010000000100, 0x000101000001ffff,
-    0x0001010000010000, 0x0001010000010101, 0x0001010001ffff01, 0x0001010001ff00ff,
-    0x0001010001ff0101, 0x0001010001000000, 0x000101000101ff00, 0x00010100010100ff,
-    0x0001010001010000, 0x0001010001010100, 0x00010101ff00ff00, 0x00010101ff000001,
-    0x00010101ff0001ff, 0x0001010100ffff00, 0x0001010100ff00ff, 0x0001010100ff0100,
-    0x000101010000ffff, 0x0001010100000000, 0x00010101000001ff, 0x0001010100000101,
-    0x00010101000100ff, 0x0001010100010000, 0x0001010100010100, 0x0001010101ff0001,
-    0x00010101010000ff, 0x00010101010001ff, 0x0001010101000101, 0x0001010101010001,
-    0x01ffffffffffffff, 0x01ffffffffffff01, 0x01ffffffffff01ff, 0x01ffffffffff0101,
-    0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, 0x01ffffffff010101,
-    0x01ffffff00ff0000, 0x01ffffff0000ffff, 0x01ffffff0000ff00, 0x01ffffff000000ff,
-    0x01ffffff00000001, 0x01ffffff00000100, 0x01ffffff00010000, 0x01ffffff01ffffff,
-    0x01ffffff01ffff01, 0x01ffffff01ff01ff, 0x01ffffff01ff0101, 0x01ffffff01000000,
-    0x01ffffff0101ffff, 0x01ffffff0101ff01, 0x01ffffff010101ff, 0x01ffffff01010101,
-    0x01ffff00ffff0000, 0x01ffff00ff00ff00, 0x01ffff00ff0000ff, 0x01ffff00ff000001,
-    0x01ffff00ff000100, 0x01ffff00ff010000, 0x01ffff0000ffff00, 0x01ffff0000ff00ff,
-    0x01ffff0000ff0100, 0x01ffff000000ffff, 0x01ffff000000ff01, 0x01ffff0000000000,
-    0x01ffff0000000001, 0x01ffff00000001ff, 0x01ffff0000000100, 0x01ffff00000100ff,
-    0x01ffff0000010001, 0x01ffff0000010100, 0x01ffff0001ff0000, 0x01ffff0001ff0100,
-    0x01ffff00010000ff, 0x01ffff0001000001, 0x01ffff0001000100, 0x01ffff0001010000,
-    0x01ffff01ffffffff, 0x01ffff01ffffff01, 0x01ffff01ffff01ff, 0x01ffff01ffff0101,
-    0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff01ff01, 0x01ffff01ff0101ff,
-    0x01ffff01ff010101, 0x01ffff010000ff00, 0x01ffff01000000ff, 0x01ffff0100000100,
-    0x01ffff0100010000, 0x01ffff0101ffffff, 0x01ffff0101ffff01, 0x01ffff0101ff01ff,
-    0x01ffff0101ff0101, 0x01ffff0101000000, 0x01ffff010101ffff, 0x01ffff010101ff01,
-    0x01ffff01010101ff, 0x01ffff0101010101, 0x01ff00ffff0000ff, 0x01ff00ffff000100,
-    0x01ff00ff00ffff00, 0x01ff00ff00ff00ff, 0x01ff00ff0000ff00, 0x01ff00ff00000000,
-    0x01ff00ff00000101, 0x01ff00ff0001ff00, 0x01ff00ff000100ff, 0x01ff00ff00010100,
-    0x01ff00ff010000ff, 0x01ff00ff01000100, 0x01ff0000ffffff00, 0x01ff0000ffff0100,
-    0x01ff0000ff00ff01, 0x01ff0000ff000000, 0x01ff0000ff000101, 0x01ff0000ff010001,
-    0x01ff0000ff010100, 0x01ff000000ffffff, 0x01ff000000ffff00, 0x01ff000000ff0000,
-    0x01ff000000ff01ff, 0x01ff00000000ff00, 0x01ff0000000000ff, 0x01ff000000000000,
-    0x01ff000000000001, 0x01ff000000000100, 0x01ff000000000101, 0x01ff000000010000,
-    0x01ff000000010001, 0x01ff0000000101ff, 0x01ff000000010101, 0x01ff000001ffff00,
-    0x01ff000001ff00ff, 0x01ff000001ff0001, 0x01ff000001ff0100, 0x01ff00000100ffff,
-    0x01ff00000100ff01, 0x01ff000001000000, 0x01ff0000010001ff, 0x01ff000001010001,
-    0x01ff0001ff00ff00, 0x01ff0001ff000001, 0x01ff0001ff000100, 0x01ff0001ff010000,
-    0x01ff000100ffff00, 0x01ff000100ff00ff, 0x01ff000100ff0100, 0x01ff000100ff0101,
-    0x01ff00010000ffff, 0x01ff000100000000, 0x01ff000100000100, 0x01ff000100000101,
-    0x01ff00010001ff00, 0x01ff000100010001, 0x01ff000100010101, 0x01ff000101ff0000,
-    0x01ff00010100ff00, 0x01ff000101000101, 0x01ff0001010100ff, 0x01ff01ffffffffff,
-    0x01ff01ffffffff01, 0x01ff01ffffff01ff, 0x01ff01ffffff0101, 0x01ff01ffff000000,
-    0x01ff01ffff01ffff, 0x01ff01ffff01ff01, 0x01ff01ffff0101ff, 0x01ff01ffff010101,
-    0x01ff01ff00ffff00, 0x01ff01ff00ff0000, 0x01ff01ff0000ff00, 0x01ff01ff000000ff,
-    0x01ff01ff00000100, 0x01ff01ff00010000, 0x01ff01ff00010100, 0x01ff01ff01ffffff,
-    0x01ff01ff01ffff01, 0x01ff01ff01ff01ff, 0x01ff01ff01ff0101, 0x01ff01ff01000000,
-    0x01ff01ff0101ffff, 0x01ff01ff0101ff01, 0x01ff01ff010101ff, 0x01ff01ff01010101,
-    0x01ff0100ffff0000, 0x01ff0100ffff0001, 0x01ff0100ff00ff00, 0x01ff0100ff0000ff,
-    0x01ff0100ff000001, 0x01ff0100ff010000, 0x01ff010000ffff00, 0x01ff010000ff00ff,
-    0x01ff010000ff0001, 0x01ff010000ff0100, 0x01ff01000000ffff, 0x01ff01000000ff01,
-    0x01ff010000000000, 0x01ff010000000101, 0x01ff01000001ff00, 0x01ff0100000100ff,
-    0x01ff010001ff0000, 0x01ff010001000001, 0x01ff010001000100, 0x01ff010001010000,
-    0x01ff0101ffffffff, 0x01ff0101ffffff01, 0x01ff0101ffff01ff, 0x01ff0101ffff0101,
-    0x01ff0101ff000000, 0x01ff0101ff01ffff, 0x01ff0101ff01ff01, 0x01ff0101ff0101ff,
-    0x01ff0101ff010101, 0x01ff010100ff0000, 0x01ff01010000ff00, 0x01ff0101000000ff,
-    0x01ff010100000001, 0x01ff010101ffffff, 0x01ff010101ffff01, 0x01ff010101ff01ff,
-    0x01ff010101ff0101, 0x01ff010101000000, 0x01ff01010101ffff, 0x01ff01010101ff01,
-    0x01ff0101010101ff, 0x01ff010101010101, 0x0100ffffffff0000, 0x0100ffffff00ff00,
-    0x0100ffffff000001, 0x0100ffffff0001ff, 0x0100ffffff000100, 0x0100ffffff010000,
-    0x0100ffff00ffff00, 0x0100ffff00ff0001, 0x0100ffff00ff0100, 0x0100ffff00000000,
-    0x0100ffff000001ff, 0x0100ffff00000101, 0x0100ffff00010100, 0x0100ffff00010101,
-    0x0100ffff01ff0000, 0x0100ffff0100ff00, 0x0100ffff010000ff, 0x0100ffff01000001,
-    0x0100ffff01000100, 0x0100ffff01010000, 0x0100ff00ffffff00, 0x0100ff00ffff00ff,
-    0x0100ff00ffff0001, 0x0100ff00ffff0100, 0x0100ff00ff00ffff, 0x0100ff00ff000000,
-    0x0100ff00ff0001ff, 0x0100ff00ff000101, 0x0100ff00ff01ff00, 0x0100ff00ff0100ff,
-    0x0100ff00ff010001, 0x0100ff00ff010100, 0x0100ff0000ffffff, 0x0100ff0000ff0000,
-    0x0100ff000000ffff, 0x0100ff000000ff00, 0x0100ff00000000ff, 0x0100ff0000000000,
-    0x0100ff0000000001, 0x0100ff0000000100, 0x0100ff000001ff01, 0x0100ff0000010000,
-    0x0100ff0001ff00ff, 0x0100ff0001ff0001, 0x0100ff000100ff01, 0x0100ff0001000000,
-    0x0100ff00010001ff, 0x0100ff000101ff00, 0x0100ff00010100ff, 0x0100ff0001010001,
-    0x0100ff0001010100, 0x0100ff01ffff0000, 0x0100ff01ff00ff00, 0x0100ff01ff0000ff,
-    0x0100ff01ff000100, 0x0100ff01ff010000, 0x0100ff0100ff00ff, 0x0100ff0100ff0001,
-    0x0100ff0100ff0100, 0x0100ff010000ffff, 0x0100ff010000ff01, 0x0100ff0100000000,
-    0x0100ff01000001ff, 0x0100ff0100010001, 0x0100ff0100010100, 0x0100ff0101ff0000,
-    0x0100ff01010000ff, 0x0100ff0101000001, 0x0100ff0101010100, 0x010000ffffffff00,
-    0x010000ffffff00ff, 0x010000ffffff0001, 0x010000ffff00ffff, 0x010000ffff000000,
-    0x010000ffff0001ff, 0x010000ffff010001, 0x010000ff00ffffff, 0x010000ff00ff0101,
-    0x010000ff0000ff00, 0x010000ff000000ff, 0x010000ff00000000, 0x010000ff00000001,
-    0x010000ff000001ff, 0x010000ff00000100, 0x010000ff0001ffff, 0x010000ff0001ff00,
-    0x010000ff0001ff01, 0x010000ff00010000, 0x010000ff01ff00ff, 0x010000ff01ff0001,
-    0x010000ff0100ff01, 0x010000ff010000ff, 0x010000ff01000000, 0x010000ff010001ff,
-    0x010000ff0101ff00, 0x010000ff01010100, 0x01000000ffffffff, 0x01000000ffff0000,
-    0x01000000ffff01ff, 0x01000000ffff0101, 0x01000000ff00ffff, 0x01000000ff00ff00,
-    0x01000000ff0000ff, 0x01000000ff000000, 0x01000000ff000001, 0x01000000ff000100,
-    0x01000000ff01ff00, 0x01000000ff010000, 0x01000000ff010100, 0x01000000ff010101,
-    0x0100000000ffff00, 0x0100000000ff00ff, 0x0100000000ff0000, 0x0100000000ff0001,
-    0x0100000000ff0100, 0x010000000000ffff, 0x010000000000ff00, 0x010000000000ff01,
-    0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, 0x01000000000001ff,
-    0x0100000000000100, 0x0100000000000101, 0x010000000001ff00, 0x01000000000100ff,
-    0x0100000000010000, 0x0100000000010001, 0x0100000000010100, 0x0100000001ffff00,
-    0x0100000001ff0000, 0x0100000001ff01ff, 0x010000000100ff00, 0x010000000100ff01,
-    0x01000000010000ff, 0x0100000001000000, 0x0100000001000001, 0x0100000001000100,
-    0x0100000001000101, 0x010000000101ffff, 0x010000000101ff01, 0x0100000001010000,
-    0x01000000010101ff, 0x0100000001010101, 0x01000001ffffff00, 0x01000001ffff00ff,
-    0x01000001ff00ffff, 0x01000001ff000000, 0x01000001ff000100, 0x01000001ff01ffff,
-    0x01000001ff010001, 0x01000001ff010100, 0x0100000100ff0000, 0x0100000100ff01ff,
-    0x0100000100ff0100, 0x010000010000ff00, 0x010000010000ff01, 0x0100000100000000,
-    0x0100000100000001, 0x0100000100000100, 0x0100000100010000, 0x01000001000101ff,
-    0x0100000101ffff01, 0x0100000101ff00ff, 0x0100000101ff0100, 0x0100000101ff0101,
-    0x010000010100ff01, 0x01000001010000ff, 0x0100000101000000, 0x01000001010100ff,
-    0x0100000101010001, 0x0100000101010100, 0x010001ffffff0000, 0x010001ffff000001,
-    0x010001ffff000100, 0x010001ffff010000, 0x010001ff00ffff00, 0x010001ff00ff0001,
-    0x010001ff0000ffff, 0x010001ff0000ff01, 0x010001ff00000000, 0x010001ff00000001,
-    0x010001ff00000101, 0x010001ff000100ff, 0x010001ff00010000, 0x010001ff01ff0000,
-    0x010001ff0100ff00, 0x010001ff01000001, 0x010001ff01000100, 0x010001ff01010000,
-    0x01000100ffff00ff, 0x01000100ffff0001, 0x01000100ffff0100, 0x01000100ff00ffff,
-    0x01000100ff00ff01, 0x01000100ff000000, 0x01000100ff0001ff, 0x01000100ff000101,
-    0x01000100ff01ffff, 0x01000100ff01ff00, 0x01000100ff0100ff, 0x01000100ff010001,
-    0x0100010000ffffff, 0x0100010000ffff01, 0x0100010000ff0000, 0x0100010000ff01ff,
-    0x0100010000ff0101, 0x010001000000ff00, 0x01000100000000ff, 0x0100010000000000,
-    0x0100010000000001, 0x0100010000000100, 0x010001000001ff01, 0x0100010000010000,
-    0x0100010000010001, 0x0100010000010101, 0x0100010001ffff00, 0x0100010001ff00ff,
-    0x010001000100ffff, 0x010001000100ff01, 0x0100010001000000, 0x0100010001000101,
-    0x010001000101ff00, 0x0100010001010001, 0x01000101ffff0000, 0x01000101ff000000,
-    0x01000101ff010000, 0x0100010100ff00ff, 0x0100010100ff0001, 0x0100010100ff0100,
-    0x010001010000ffff, 0x0100010100000000, 0x01000101000001ff, 0x010001010001ff00,
-    0x0100010101ff0000, 0x010001010100ff00, 0x01000101010000ff, 0x0100010101000000,
-    0x0100010101000001, 0x0101ffffffffffff, 0x0101ffffffffff01, 0x0101ffffffff01ff,
-    0x0101ffffffff0101, 0x0101ffffff000000, 0x0101ffffff01ffff, 0x0101ffffff01ff01,
-    0x0101ffffff0101ff, 0x0101ffffff010101, 0x0101ffff00ff0000, 0x0101ffff0000ff00,
-    0x0101ffff000000ff, 0x0101ffff00000001, 0x0101ffff00000100, 0x0101ffff01ffffff,
-    0x0101ffff01ffff01, 0x0101ffff01ff01ff, 0x0101ffff01ff0101, 0x0101ffff01000000,
-    0x0101ffff0101ffff, 0x0101ffff0101ff01, 0x0101ffff010101ff, 0x0101ffff01010101,
-    0x0101ff00ffff0000, 0x0101ff00ffff0100, 0x0101ff00ff00ff00, 0x0101ff00ff0000ff,
-    0x0101ff00ff000001, 0x0101ff00ff000100, 0x0101ff00ff000101, 0x0101ff0000ff0001,
-    0x0101ff0000ff0100, 0x0101ff000000ff00, 0x0101ff0000000000, 0x0101ff00000001ff,
-    0x0101ff0000000101, 0x0101ff000001ff00, 0x0101ff00000100ff, 0x0101ff0001ff0000,
-    0x0101ff000100ffff, 0x0101ff000100ff01, 0x0101ff0001000001, 0x0101ff0001000100,
-    0x0101ff01ffffff01, 0x0101ff01ffff01ff, 0x0101ff01ffff0101, 0x0101ff01ff00ffff,
-    0x0101ff01ff000100, 0x0101ff01ff01ff01, 0x0101ff01ff0101ff, 0x0101ff01ff010101,
-    0x0101ff0100ff0000, 0x0101ff010000ff00, 0x0101ff0100000001, 0x0101ff0100000100,
-    0x0101ff0100010000, 0x0101ff0101ffffff, 0x0101ff0101ffff01, 0x0101ff0101ff01ff,
-    0x0101ff0101ff0101, 0x0101ff0101000000, 0x0101ff010101ffff, 0x0101ff010101ff01,
-    0x0101ff01010101ff, 0x0101ff0101010101, 0x010100ffff000100, 0x010100ffff010000,
-    0x010100ff00ffff00, 0x010100ff00ff00ff, 0x010100ff0000ffff, 0x010100ff000000ff,
-    0x010100ff00000000, 0x010100ff000001ff, 0x010100ff00000101, 0x010100ff0001ff00,
-    0x010100ff00010000, 0x010100ff00010001, 0x010100ff000101ff, 0x010100ff00010100,
-    0x010100ff01ff0000, 0x01010000ffff0001, 0x01010000ffff0100, 0x01010000ff00ffff,
-    0x01010000ff00ff01, 0x01010000ff000000, 0x01010000ff0001ff, 0x01010000ff010001,
-    0x01010000ff010100, 0x0101000000ffff01, 0x0101000000ff0000, 0x010100000000ff00,
-    0x01010000000000ff, 0x0101000000000000, 0x0101000000000001, 0x0101000000000100,
-    0x0101000000010000, 0x0101000000010101, 0x0101000001ffff00, 0x0101000001ff00ff,
-    0x0101000001ff0000, 0x0101000001ff0001, 0x0101000001ff0100, 0x010100000100ff01,
-    0x0101000001000000, 0x01010000010001ff, 0x01010001ffff0000, 0x01010001ff00ff00,
-    0x01010001ff000001, 0x01010001ff000101, 0x01010001ff01ff00, 0x01010001ff010000,
-    0x0101000100ff00ff, 0x0101000100ff0001, 0x0101000100ff0101, 0x010100010000ff01,
-    0x0101000100000000, 0x0101000100000001, 0x01010001000001ff, 0x010100010001ffff,
-    0x010100010001ff01, 0x0101000101ff0001, 0x010100010100ffff, 0x0101000101000000,
-    0x0101000101000001, 0x0101000101000100, 0x010100010101ff00, 0x01010001010100ff,
-    0x0101000101010001, 0x010101ffffffffff, 0x010101ffffffff01, 0x010101ffffff01ff,
-    0x010101ffffff0101, 0x010101ffff01ffff, 0x010101ffff01ff01, 0x010101ffff0101ff,
-    0x010101ffff010101, 0x010101ff0000ff00, 0x010101ff000000ff, 0x010101ff00000001,
-    0x010101ff00000100, 0x010101ff01ffffff, 0x010101ff01ffff01, 0x010101ff01ff01ff,
-    0x010101ff01ff0101, 0x010101ff01000000, 0x010101ff0101ffff, 0x010101ff0101ff01,
-    0x010101ff010101ff, 0x010101ff01010101, 0x01010100ffff0000, 0x01010100ff0000ff,
-    0x01010100ff000100, 0x01010100ff01ff00, 0x01010100ff010000, 0x0101010000ffff00,
-    0x010101000000ffff, 0x0101010000000000, 0x0101010000000101, 0x010101000001ff00,
-    0x0101010000010001, 0x0101010000010100, 0x010101000100ffff, 0x0101010001000001,
-    0x01010101ffffffff, 0x01010101ffffff01, 0x01010101ffff01ff, 0x01010101ffff0101,
-    0x01010101ff01ffff, 0x01010101ff01ff01, 0x01010101ff0101ff, 0x01010101ff010101,
-    0x010101010000ff00, 0x01010101000000ff, 0x0101010100000001, 0x0101010101ffffff,
-    0x0101010101ffff01, 0x0101010101ff01ff, 0x0101010101ff0101, 0x0101010101000000,
-    0x010101010101ffff, 0x010101010101ff01, 0x01010101010101ff, 0x0101010101010101,
-GGML_TABLE_END()
-#else
-GGML_TABLE_BEGIN(uint32_t, iq1s_grid_gpu, NGRID_IQ1S)
-    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
-    0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
-    0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
-    0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
-    0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
-    0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
-    0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
-    0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
-    0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
-    0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
-    0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
-    0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
-    0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
-    0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
-    0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
-    0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
-    0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
-    0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
-    0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
-    0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
-    0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
-    0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
-    0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
-    0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
-    0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
-    0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
-    0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
-    0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
-    0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
-    0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
-    0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
-    0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
-    0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
-    0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
-    0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
-    0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
-    0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
-    0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
-    0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
-    0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
-    0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
-    0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
-    0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
-    0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
-    0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
-    0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
-    0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
-    0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
-    0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
-    0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
-    0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
-    0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
-    0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
-    0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
-    0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
-    0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
-    0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
-    0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
-    0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
-    0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
-    0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
-    0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
-    0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
-    0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
-    0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
-    0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
-    0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
-    0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
-    0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
-    0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
-    0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
-    0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
-    0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
-    0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
-    0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
-    0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
-    0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
-    0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
-    0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
-    0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
-    0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
-    0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
-    0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
-    0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
-    0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
-    0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
-    0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
-    0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
-    0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
-    0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
-    0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
-    0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
-    0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
-    0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
-    0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
-    0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
-    0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
-    0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
-    0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
-    0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
-    0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
-    0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
-    0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
-    0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
-    0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
-    0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
-    0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
-    0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
-    0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
-    0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
-    0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
-    0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
-    0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
-    0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
-    0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
-    0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
-    0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
-    0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
-    0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
-    0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
-    0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
-    0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
-    0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
-    0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
-    0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
-    0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
-    0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
-    0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
-    0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
-    0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
-    0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
-    0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
-    0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
-    0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
-    0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
-    0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
-    0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
-    0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
-    0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
-    0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
-    0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
-    0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
-    0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
-    0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
-    0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
-    0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
-    0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
-    0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
-    0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
-    0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
-    0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
-    0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
-    0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
-    0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
-    0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
-    0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
-    0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
-    0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
-    0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
-    0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
-    0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
-    0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
-    0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
-    0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
-    0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
-    0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
-    0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
-    0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
-    0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
-    0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
-    0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
-    0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
-    0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
-    0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
-    0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
-    0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
-    0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
-    0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
-    0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
-    0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
-    0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
-    0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
-    0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
-    0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
-    0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
-    0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
-    0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
-    0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
-    0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
-    0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
-    0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
-    0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
-    0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
-    0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
-    0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
-    0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
-    0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
-    0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
-    0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
-    0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
-    0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
-    0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
-    0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
-    0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
-    0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
-    0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
-    0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
-    0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
-    0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
-    0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
-    0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
-    0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
-    0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
-    0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
-    0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
-    0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
-    0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
-    0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
-    0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
-    0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
-    0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
-    0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
-    0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
-    0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
-    0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
-    0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
-    0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
-    0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
-    0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
-    0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
-    0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
-    0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
-    0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
-    0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
-    0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
-    0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
-    0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
-    0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
-    0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
-    0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
-    0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
-    0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
-    0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
-    0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
-    0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
-    0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
-    0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
-    0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
-    0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
-    0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
-    0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
-    0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
-    0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
-    0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
-    0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
-    0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
-GGML_TABLE_END()
-#endif
-
-#endif // GGML_COMMON_IMPL
-#endif // GGML_COMMON_IMPL
+#ifndef GGML_COMMON_DECL
+
+#if defined(GGML_COMMON_DECL_C)
+#include <stdint.h>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+#define GGML_COMMON_AGGR
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_METAL)
+#include <metal_stdlib>
+
+typedef half  ggml_half;
+typedef half2 ggml_half2;
+
+#define GGML_COMMON_AGGR
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CUDA)
+#include <cuda_fp16.h>
+#include <cstdint>
+
+typedef half  ggml_half;
+typedef half2 ggml_half2;
+
+#define GGML_COMMON_AGGR data
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_HIP)
+#include <hip/hip_fp16.h>
+#include <cstdint>
+
+typedef half  ggml_half;
+typedef half2 ggml_half2;
+
+#define GGML_COMMON_AGGR data
+
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_SYCL)
+#include <sycl/half_type.hpp>
+#include <cstdint>
+
+typedef sycl::half  ggml_half;
+typedef sycl::half2 ggml_half2;
+
+#define GGML_COMMON_AGGR data
+
+#define GGML_COMMON_DECL
+#endif
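+
+// Net effect of the block above (illustrative summary, not part of the
+// upstream header): every backend sees a 16-bit float under the same two
+// names -- plain C keeps only the bit pattern in a uint16_t, Metal/CUDA/HIP
+// use their native 'half', and SYCL uses sycl::half. GGML_COMMON_AGGR
+// controls whether the struct nested inside the unions below is anonymous
+// (C, Metal) or a named member 'data' on the C++-based backends.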
+
+#if defined(GGML_COMMON_DECL)
+
+#ifndef __cplusplus
+#ifndef static_assert
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+#define static_assert(cond, msg) _Static_assert(cond, msg)
+#else
+#define static_assert(cond, msg) struct global_scope_noop_trick
+#endif
+#endif
+#endif // __cplusplus
+
+// QK = number of values after dequantization
+// QK_K = super-block size
+
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
+#define QK_K 256
+#define K_SCALE_SIZE 12
+#endif // GGML_QKK_64
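+
+// Worked example (illustrative note, not part of the upstream header): with
+// the default QK_K == 256, q4_K/q5_K below store 8 sub-block scales plus 8
+// mins at 6 bits each, i.e. 2*8*6 = 96 bits = 12 bytes -- which is where
+// K_SCALE_SIZE == 12 comes from.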
+
+#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
+// QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
+
+#define QI4_0 (QK4_0 / (4 * QR4_0))
+#define QR4_0 2
+
+#define QI4_1 (QK4_1 / (4 * QR4_1))
+#define QR4_1 2
+
+#define QI5_0 (QK5_0 / (4 * QR5_0))
+#define QR5_0 2
+
+#define QI5_1 (QK5_1 / (4 * QR5_1))
+#define QR5_1 2
+
+#define QI8_0 (QK8_0 / (4 * QR8_0))
+#define QR8_0 1
+
+#define QI8_1 (QK8_1 / (4 * QR8_1))
+#define QR8_1 1
+
+#define QI2_K (QK_K / (4*QR2_K))
+#define QR2_K 4
+
+#define QI3_K (QK_K / (4*QR3_K))
+#define QR3_K 4
+
+#define QI4_K (QK_K / (4*QR4_K))
+#define QR4_K 2
+
+#define QI5_K (QK_K / (4*QR5_K))
+#define QR5_K 2
+
+#define QI6_K (QK_K / (4*QR6_K))
+#define QR6_K 2
+
+#define QI2_XXS (QK_K / (4*QR2_XXS))
+#define QR2_XXS 8
+
+#define QI2_XS (QK_K / (4*QR2_XS))
+#define QR2_XS 8
+
+#define QI2_S (QK_K / (4*QR2_S))
+#define QR2_S 8
+
+#define QI3_XXS (QK_K / (4*QR3_XXS))
+#define QR3_XXS 8
+
+#define QI3_XS (QK_K / (4*QR3_XS))
+#define QR3_XS 8
+
+#define QI1_S (QK_K / (4*QR1_S))
+#define QR1_S 8
+
+#define QI4_NL (QK4_NL / (4*QR4_NL))
+#define QR4_NL 2
+
+#if QK_K == 64
+#define QI4_XS QI4_NL
+#define QR4_XS QR4_NL
+#else
+#define QI4_XS (QK_K / (4*QR4_XS))
+#define QR4_XS 8
+#endif
+
+#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP || GGML_COMMON_DECL_SYCL
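+
+// Worked example (illustrative): q4_0 packs two 4-bit quants per byte, so
+// QR4_0 == 2, and a block of QK4_0 == 32 quants occupies
+// QI4_0 == 32 / (4 * 2) == 4 32-bit integers -- the granularity at which the
+// CUDA/HIP/SYCL dot-product kernels walk the quantized data.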
+
+#define QK4_0 32
+typedef struct {
+    ggml_half d;           // delta
+    uint8_t qs[QK4_0 / 2]; // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 block size/padding");
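+
+// Dequantization sketch for q4_0 (illustrative; the reference implementation
+// lives in ggml-quants.c). Low nibbles hold the first half of the block and
+// high nibbles the second half, stored with a +8 offset so the value range is
+// [-8, 7]:
+//
+//     const float d = /* x->d widened from ggml_half to float */;
+//     for (int j = 0; j < QK4_0/2; ++j) {
+//         y[j]           = ((x->qs[j] & 0x0F) - 8) * d; // low nibble
+//         y[j + QK4_0/2] = ((x->qs[j] >>   4) - 8) * d; // high nibble
+//     }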
+
+#define QK4_1 32
+typedef struct {
+    union {
+        struct {
+            ggml_half d; // delta
+            ggml_half m; // min
+        } GGML_COMMON_AGGR;
+        ggml_half2 dm;
+    };
+    uint8_t qs[QK4_1 / 2]; // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
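+
+// q4_1 is the affine variant: a nibble q dequantizes to y = d * q + m, so no
+// -8 offset is needed (the min m anchors the range). The anonymous union lets
+// scalar code read d and m separately while vector backends load both halves
+// in one go through the ggml_half2 dm member.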
+
+#define QK5_0 32
+typedef struct {
+    ggml_half d;           // delta
+    uint8_t qh[4];         // 5th bit of quants
+    uint8_t qs[QK5_0 / 2]; // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+typedef struct {
+    union {
+        struct {
+            ggml_half d; // delta
+            ggml_half m; // min
+        } GGML_COMMON_AGGR;
+        ggml_half2 dm;
+    };
+    uint8_t qh[4];         // 5th bit of quants
+    uint8_t qs[QK5_1 / 2]; // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+#define QK8_0 32
+typedef struct {
+    ggml_half d;       // delta
+    int8_t  qs[QK8_0]; // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block size/padding");
+
+#define QK8_1 32
+typedef struct {
+    union {
+        struct {
+            ggml_half d; // delta
+            ggml_half s; // d * sum(qs[i])
+        } GGML_COMMON_AGGR;
+        ggml_half2 ds;
+    };
+    int8_t qs[QK8_1]; // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
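+
+// Why q8_1 precomputes s = d * sum(qs[i]) (illustrative derivation): in a dot
+// product against an affine type such as q4_1,
+//     sum_i (d4*q4_i + m) * (d8*q8_i) = d4*d8 * sum_i q4_i*q8_i + m*s,
+// so the "+ m" contribution costs one multiply instead of a second pass over
+// the quants.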
+
+//
+// Super-block quantization structures
+//
+
+// 2-bit quantization
+// weight is represented as x = a * q + b
+// 16 blocks of 16 elements each
+// Effectively 2.625 bits per weight
+typedef struct {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    union {
+        struct {
+            ggml_half d;    // super-block scale for quantized scales
+            ggml_half dmin; // super-block scale for quantized mins
+        } GGML_COMMON_AGGR;
+        ggml_half2 dm;
+    };
+} block_q2_K;
+static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
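+
+// Size check (illustrative): with QK_K == 256 a block_q2_K is
+// 16 (scales) + 64 (qs) + 4 (d + dmin) = 84 bytes for 256 weights,
+// i.e. 84*8/256 = 2.625 bits per weight, matching the comment above.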
+
+// 3-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 3.4375 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    uint8_t hmask[QK_K/8]; // quants - high bit
+    uint8_t qs[QK_K/4];    // quants - low 2 bits
+    uint8_t scales[2];
+    ggml_half d;           // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
+#else
+typedef struct {
+    uint8_t hmask[QK_K/8]; // quants - high bit
+    uint8_t qs[QK_K/4];    // quants - low 2 bits
+    uint8_t scales[12];    // scales, quantized with 6 bits
+    ggml_half d;           // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
+#endif
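+
+// The same arithmetic checks q3_K at QK_K == 256: 32 (hmask) + 64 (qs) +
+// 12 (scales) + 2 (d) = 110 bytes per 256 weights = 3.4375 bits per weight.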
+
+// 4-bit quantization
+// 8 blocks of 32 elements each
+// weight is represented as x = a * q + b
+// Effectively 4.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_half d[2];     // super-block scales/mins
+    uint8_t scales[2];  // 4-bit block scales/mins
+    uint8_t qs[QK_K/2]; // 4-bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
+typedef struct {
+    union {
+        struct {
+            ggml_half d;    // super-block scale for quantized scales
+            ggml_half dmin; // super-block scale for quantized mins
+        } GGML_COMMON_AGGR;
+        ggml_half2 dm;
+    };
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];           // 4-bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
+#endif
+
+// 5-bit quantization
+// 8 blocks of 32 elements each
+// weight is represented as x = a * q + b
+// Effectively 5.5 bits per weight
+#ifdef GGML_QKK_64
+typedef struct {
+    ggml_half d;             // super-block scale
+    int8_t  scales[QK_K/16]; // 8-bit block scales
+    uint8_t qh[QK_K/8];      // quants, high bit
+    uint8_t qs[QK_K/2];      // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct {
+    union {
+        struct {
+            ggml_half d;    // super-block scale for quantized scales
+            ggml_half dmin; // super-block scale for quantized mins
+        } GGML_COMMON_AGGR;
+        ggml_half2 dm;
+    };
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];           // quants, high bit
+    uint8_t qs[QK_K/2];           // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
+
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elements each
+// Effectively 6.5625 bits per weight
+typedef struct {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    ggml_half d;             // super-block scale
+} block_q6_K;
+static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
+
+// This is only used for intermediate quantization and dot products
+typedef struct {
+    float   d;              // delta
+    int8_t  qs[QK_K];       // quants
+    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
+} block_q8_K;
+static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
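+
+// bsums plays the same role for the k-quants that s plays for q8_1: dot
+// products against types with per-sub-block mins (e.g. q2_K, q4_K) can fold
+// the min contribution as sum_j(min_j * bsums[j]) without revisiting the
+// individual quants.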
+
+// (Almost) "true" 2-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 2.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_half d;
+    uint16_t qs[QK_K/8];
+} block_iq2_xxs;
+static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
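+
+// Size check (illustrative): 2 (d) + 2*QK_K/8 = 2 + 64 = 66 bytes per 256
+// weights = 2.0625 bits per weight, as claimed above. The qs words pack 8-bit
+// indices into iq2xxs_grid below together with 7-bit sign indices (see
+// ksigns_iq2xs) and 4-bit sub-block scales.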
+
+// 2.3125 bpw quants
+typedef struct {
+    ggml_half d;
+    uint16_t qs[QK_K/8];
+    uint8_t  scales[QK_K/32];
+} block_iq2_xs;
+static_assert(sizeof(block_iq2_xs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
+
+// 2.5625 bpw quants
+typedef struct {
+    ggml_half d;
+    uint8_t qs[QK_K/4];
+    uint8_t qh[QK_K/32];
+    uint8_t scales[QK_K/32];
+} block_iq2_s;
+static_assert(sizeof(block_iq2_s) == sizeof(ggml_half) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
+
+// (Almost) "true" 3-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 3.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_half d;
+    uint8_t qs[3*QK_K/8];
+} block_iq3_xxs;
+static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
+
+// 3.4375 bpw
+#if QK_K == 64
+#define IQ3S_N_SCALE 2
+#else
+#define IQ3S_N_SCALE QK_K/64
+#endif
+typedef struct {
+    ggml_half d;
+    uint8_t qs[QK_K/4];
+    uint8_t qh[QK_K/32];
+    uint8_t signs[QK_K/8];
+    uint8_t scales[IQ3S_N_SCALE];
+} block_iq3_s;
+static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
+
+typedef struct {
+    ggml_half d;
+    uint8_t  qs[QK_K/8];
+    uint16_t qh[QK_K/32];
+} block_iq1_s;
+static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
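+
+// (The same size arithmetic gives iq1_s a footprint of 2 + QK_K/8 + QK_K/16 =
+// 50 bytes per 256 weights, i.e. 1.5625 bits per weight.)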
+
+// 1.75 bpw
+typedef struct {
+    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
+    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
+#if QK_K == 64
+    ggml_half d;
+#endif
+    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+#if QK_K == 64
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
+#else
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+#endif
+
+// Used by IQ1_M quants
+typedef union {
+    ggml_half f16;
+    uint16_t  u16;
+} iq1m_scale_t;
+
+// Non-linear quants
+#define QK4_NL 32
+typedef struct {
+    ggml_half d;
+    uint8_t qs[QK4_NL/2];
+} block_iq4_nl;
+static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
+
+#if QK_K == 64
+#define block_iq4_xs block_iq4_nl
+#else
+typedef struct {
+    ggml_half d;
+    uint16_t scales_h;
+    uint8_t  scales_l[QK_K/64];
+    uint8_t  qs[QK_K/2];
+} block_iq4_xs;
+static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
+#endif
+
+#endif // GGML_COMMON_DECL
+#endif // GGML_COMMON_DECL
+
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef GGML_COMMON_IMPL
+
+#if defined(GGML_COMMON_IMPL_C)
+#include <stdint.h>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_METAL)
+#include <metal_stdlib>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const constant type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_SYCL)
+
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
+#define GGML_COMMON_IMPL
+#endif
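+
+// With these macros the table data below compiles unchanged as a plain C
+// array, a Metal 'constant' address-space array, or a CUDA/HIP '__device__'
+// array, depending on which GGML_COMMON_IMPL_* macro the including file
+// defined first.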
+
+#if defined(GGML_COMMON_IMPL)
+
+GGML_TABLE_BEGIN(uint8_t, kmask_iq2xs, 8)
+    1, 2, 4, 8, 16, 32, 64, 128
+GGML_TABLE_END()
+
+GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
+      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
+    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
+    160,  33,  34, 163,  36, 165, 166,  39,  40, 169, 170,  43, 172,  45,  46, 175,
+     48, 177, 178,  51, 180,  53,  54, 183, 184,  57,  58, 187,  60, 189, 190,  63,
+    192,  65,  66, 195,  68, 197, 198,  71,  72, 201, 202,  75, 204,  77,  78, 207,
+     80, 209, 210,  83, 212,  85,  86, 215, 216,  89,  90, 219,  92, 221, 222,  95,
+     96, 225, 226,  99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
+    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
+GGML_TABLE_END()
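+
+// The pattern above is mechanical (illustrative note): entry i is
+// i | (parity(i) << 7), i.e. the 7-bit packed sign index extended with an
+// eighth bit that gives the byte even parity. Bit k of an entry says whether
+// weight k in a group of 8 is negated, tested with kmask_iq2xs[k]:
+//
+//     const uint8_t signs = ksigns_iq2xs[sign_idx & 127];
+//     y[k] = (signs & kmask_iq2xs[k]) ? -v : v;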
+
+//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
+    0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
+    0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
+    0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
+    0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
+    0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
+    0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
+    0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
+    0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
+    0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
+    0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
+    0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
+    0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
+    0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
+    0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
+    0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
+    0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
+    0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
+    0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
+    0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
+    0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
+    0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
+    0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
+    0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
+    0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
+    0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
+    0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
+    0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
+    0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
+    0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
+    0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
+    0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
+    0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
+GGML_TABLE_END()
+//#endif
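+
+// ksigns64 is ksigns_iq2xs widened for SIMD use: bit k of ksigns_iq2xs[i]
+// becomes byte k of ksigns64[i] (0x00 or 0xff), so all 8 signs of a group can
+// be applied through one 64-bit mask instead of 8 scalar tests.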
+
+
+GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
+    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
+    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
+    0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
+    0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
+    0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
+    0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
+    0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
+    0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
+    0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
+    0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
+    0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
+    0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
+    0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
+    0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
+    0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
+    0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
+    0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
+    0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
+    0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
+    0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
+    0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
+    0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
+    0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
+    0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
+    0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
+    0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
+    0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
+    0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
+    0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
+    0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
+    0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
+    0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
+    0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
+    0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
+    0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
+    0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
+    0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
+    0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
+    0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
+    0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
+    0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
+    0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
+    0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
+    0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
+    0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
+    0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
+    0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
+    0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
+    0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
+    0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
+    0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
+    0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
+    0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
+    0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
+    0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
+    0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
+    0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
+    0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
+    0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
+    0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
+    0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
+    0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
+    0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
+    0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
+GGML_TABLE_END()
+
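+// Codebook for the IQ2_XS quant type: 512 uint64_t entries, each packing
+// eight byte lanes drawn from the 3-value alphabet {0x08, 0x19, 0x2b}
+// that the dequantization kernels index into.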
+GGML_TABLE_BEGIN(uint64_t, iq2xs_grid, 512)
+    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
+    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
+    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
+    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
+    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
+    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
+    0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
+    0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
+    0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
+    0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
+    0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
+    0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
+    0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
+    0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
+    0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
+    0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
+    0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
+    0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
+    0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
+    0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
+    0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
+    0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
+    0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
+    0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
+    0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
+    0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
+    0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
+    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
+    0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
+    0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
+    0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
+    0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
+    0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
+    0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
+    0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
+    0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
+    0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
+    0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
+    0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
+    0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
+    0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
+    0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
+    0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
+    0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
+    0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
+    0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
+    0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
+    0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
+    0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
+    0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
+    0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
+    0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
+    0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
+    0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
+    0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
+    0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
+    0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
+    0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
+    0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
+    0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
+    0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
+    0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
+    0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
+    0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
+    0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
+    0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
+    0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
+    0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
+    0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
+    0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
+    0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
+    0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
+    0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
+    0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
+    0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
+    0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
+    0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
+    0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
+    0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
+    0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
+    0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
+    0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
+    0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
+    0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
+    0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
+    0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
+    0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
+    0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
+    0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
+    0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
+    0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
+    0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
+    0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
+    0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
+    0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
+    0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
+    0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
+    0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
+    0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
+    0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
+    0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
+    0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
+    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
+    0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
+    0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
+    0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
+    0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
+    0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
+    0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
+    0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
+    0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
+    0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
+    0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
+    0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
+    0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
+    0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
+    0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
+    0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
+    0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
+    0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
+    0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
+    0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
+    0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
+    0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
+    0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
+    0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
+    0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
+    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
+GGML_TABLE_END()
+
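+// Codebook for IQ2_S: same 8-lane byte packing as iq2xs_grid above,
+// widened to 1024 entries.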
+GGML_TABLE_BEGIN(uint64_t, iq2s_grid, 1024)
+    0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
+    0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
+    0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
+    0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
+    0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
+    0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
+    0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
+    0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
+    0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
+    0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
+    0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
+    0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
+    0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
+    0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
+    0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
+    0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
+    0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
+    0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
+    0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
+    0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
+    0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
+    0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
+    0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
+    0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
+    0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
+    0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
+    0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
+    0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
+    0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
+    0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
+    0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
+    0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
+    0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
+    0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
+    0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
+    0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
+    0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
+    0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
+    0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
+    0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
+    0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
+    0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
+    0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
+    0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
+    0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
+    0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
+    0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
+    0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
+    0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
+    0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
+    0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
+    0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
+    0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
+    0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
+    0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
+    0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
+    0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
+    0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
+    0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
+    0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
+    0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
+    0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
+    0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
+    0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
+    0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
+    0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
+    0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
+    0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
+    0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
+    0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
+    0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
+    0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
+    0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
+    0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
+    0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
+    0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
+    0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
+    0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
+    0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
+    0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
+    0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
+    0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
+    0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
+    0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
+    0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
+    0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
+    0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
+    0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
+    0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
+    0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
+    0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
+    0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
+    0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
+    0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
+    0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
+    0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
+    0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
+    0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
+    0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
+    0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
+    0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
+    0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
+    0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
+    0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
+    0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
+    0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
+    0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
+    0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
+    0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
+    0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
+    0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
+    0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
+    0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
+    0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
+    0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
+    0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
+    0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
+    0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
+    0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
+    0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
+    0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
+    0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
+    0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
+    0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
+    0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
+    0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
+    0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
+    0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
+    0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
+    0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
+    0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
+    0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
+    0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
+    0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
+    0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
+    0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
+    0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
+    0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
+    0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
+    0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
+    0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
+    0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
+    0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
+    0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
+    0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
+    0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
+    0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
+    0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
+    0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
+    0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
+    0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
+    0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
+    0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
+    0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
+    0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
+    0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
+    0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
+    0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
+    0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
+    0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
+    0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
+    0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
+    0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
+    0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
+    0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
+    0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
+    0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
+    0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
+    0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
+    0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
+    0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
+    0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
+    0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
+    0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
+    0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
+    0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
+    0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
+    0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
+    0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
+    0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
+    0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
+    0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
+    0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
+    0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
+    0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
+    0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
+    0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
+    0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
+    0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
+    0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
+    0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
+    0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
+    0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
+    0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
+    0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
+    0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
+    0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
+    0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
+    0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
+    0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
+    0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
+    0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
+    0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
+    0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
+    0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
+    0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
+    0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
+    0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
+    0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
+    0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
+    0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
+    0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
+    0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
+    0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
+    0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
+    0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
+    0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
+    0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
+    0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
+    0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
+    0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
+    0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
+    0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
+    0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
+    0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
+    0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
+    0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
+    0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
+    0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
+    0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
+    0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
+    0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
+    0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
+    0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
+    0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
+    0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
+    0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
+    0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
+    0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
+    0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
+    0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
+    0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
+    0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
+    0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
+    0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
+    0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
+    0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
+    0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
+    0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
+    0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
+    0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
+    0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
+    0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
+    0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
+    0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
+    0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
+GGML_TABLE_END()
+
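+// Codebook for IQ3_XXS: 256 uint32_t entries, four packed byte lanes per entry.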
+GGML_TABLE_BEGIN(uint32_t, iq3xxs_grid, 256)
+    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
+    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
+    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
+    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
+    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
+    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
+    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
+    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
+    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
+    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
+    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
+    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
+    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
+    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
+    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
+    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
+    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
+    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
+    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
+    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
+    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
+    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
+    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
+    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
+    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
+    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
+    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
+    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
+    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
+    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
+    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
+    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
+GGML_TABLE_END()
+
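+// Codebook for IQ3_S: 512 uint32_t entries, four packed byte lanes per entry,
+// built from the odd byte values 0x01..0x0f.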
+GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
+    0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
+    0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
+    0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
+    0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
+    0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
+    0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
+    0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
+    0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
+    0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
+    0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
+    0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
+    0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
+    0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
+    0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
+    0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
+    0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
+    0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
+    0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
+    0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
+    0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
+    0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
+    0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
+    0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
+    0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
+    0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
+    0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
+    0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
+    0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
+    0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
+    0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
+    0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
+    0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
+    0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
+    0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
+    0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
+    0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
+    0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
+    0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
+    0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
+    0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
+    0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
+    0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
+    0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
+    0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
+    0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
+    0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
+    0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
+    0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
+    0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
+    0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
+    0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
+    0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
+    0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
+    0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
+    0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
+    0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
+    0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
+    0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
+    0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
+    0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
+    0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
+    0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
+    0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
+    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
+GGML_TABLE_END()
+
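+// IQ1_S / IQ1_M constants: a 2048-entry codebook plus the fixed 0.125f delta
+// used by the IQ1 dequantization path; the grid itself is only emitted for
+// the plain-C implementation (see the GGML_COMMON_IMPL_C guard below).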
+#define NGRID_IQ1S 2048
+#define IQ1S_DELTA 0.125f
+#define IQ1M_DELTA 0.125f
+#if defined(GGML_COMMON_IMPL_C)
+GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
+    0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
+    0xffffffffffff0101, 0xffffffffff00ff00, 0xffffffffff000000, 0xffffffffff01ffff,
+    0xffffffffff01ff01, 0xffffffffff0101ff, 0xffffffffff010101, 0xffffffff00ff0000,
+    0xffffffff0000ff00, 0xffffffff000000ff, 0xffffffff00000001, 0xffffffff00010000,
+    0xffffffff01ffffff, 0xffffffff01ffff01, 0xffffffff01ff01ff, 0xffffffff01ff0101,
+    0xffffffff01000000, 0xffffffff0101ffff, 0xffffffff0101ff01, 0xffffffff010101ff,
+    0xffffffff01010101, 0xffffff00ffff00ff, 0xffffff00ffff0000, 0xffffff00ff00ff00,
+    0xffffff00ff0000ff, 0xffffff00ff000001, 0xffffff00ff000100, 0xffffff00ff000101,
+    0xffffff00ff010000, 0xffffff0000ffff00, 0xffffff0000ff0001, 0xffffff0000ff0100,
+    0xffffff000000ff01, 0xffffff0000000000, 0xffffff0000000101, 0xffffff000001ff00,
+    0xffffff00000100ff, 0xffffff0000010001, 0xffffff00000101ff, 0xffffff0001ff0000,
+    0xffffff000100ff00, 0xffffff00010000ff, 0xffffff0001000001, 0xffffff0001010000,
+    0xffffff01ffffffff, 0xffffff01ffffff01, 0xffffff01ffff01ff, 0xffffff01ffff0101,
+    0xffffff01ff000000, 0xffffff01ff01ffff, 0xffffff01ff01ff01, 0xffffff01ff0101ff,
+    0xffffff01ff010101, 0xffffff0100ff0000, 0xffffff010000ff00, 0xffffff0100000100,
+    0xffffff01000100ff, 0xffffff0100010100, 0xffffff0101ffffff, 0xffffff0101ffff01,
+    0xffffff0101ff01ff, 0xffffff0101ff0101, 0xffffff010100ff00, 0xffffff0101000000,
+    0xffffff0101000100, 0xffffff010101ffff, 0xffffff010101ff01, 0xffffff01010101ff,
+    0xffffff0101010101, 0xffff00ffff00ff00, 0xffff00ffff0000ff, 0xffff00ffff000001,
+    0xffff00ffff010000, 0xffff00ff00ffff00, 0xffff00ff00ff0100, 0xffff00ff00000000,
+    0xffff00ff00000101, 0xffff00ff000100ff, 0xffff00ff00010000, 0xffff00ff0100ff00,
+    0xffff00ff01000100, 0xffff00ff01010000, 0xffff0000ffffff00, 0xffff0000ffff00ff,
+    0xffff0000ffff0000, 0xffff0000ffff0001, 0xffff0000ff000000, 0xffff0000ff0001ff,
+    0xffff0000ff000101, 0xffff0000ff010100, 0xffff000000ffffff, 0xffff000000ff0000,
+    0xffff000000ff0101, 0xffff00000000ffff, 0xffff00000000ff00, 0xffff0000000000ff,
+    0xffff000000000000, 0xffff000000000001, 0xffff000000000100, 0xffff00000001ffff,
+    0xffff00000001ff01, 0xffff000000010000, 0xffff0000000101ff, 0xffff000000010101,
+    0xffff000001ffff00, 0xffff00000100ff00, 0xffff000001000000, 0xffff0000010001ff,
+    0xffff000001000101, 0xffff00000101ff00, 0xffff0000010100ff, 0xffff000001010000,
+    0xffff000001010001, 0xffff000001010100, 0xffff0001ff0000ff, 0xffff0001ff000100,
+    0xffff000100ffff00, 0xffff000100ff00ff, 0xffff00010000ffff, 0xffff00010000ff01,
+    0xffff000100000000, 0xffff0001000001ff, 0xffff00010001ffff, 0xffff00010001ff00,
+    0xffff000100010001, 0xffff000100010100, 0xffff000101ff0000, 0xffff00010100ff00,
+    0xffff0001010000ff, 0xffff000101000100, 0xffff01ffffffffff, 0xffff01ffffffff01,
+    0xffff01ffffff01ff, 0xffff01ffffff0101, 0xffff01ffff000000, 0xffff01ffff01ffff,
+    0xffff01ffff01ff01, 0xffff01ffff0101ff, 0xffff01ffff010101, 0xffff01ff00ff0000,
+    0xffff01ff0000ff00, 0xffff01ff00000001, 0xffff01ff00010000, 0xffff01ff01ffffff,
+    0xffff01ff01ffff01, 0xffff01ff01ff01ff, 0xffff01ff01ff0101, 0xffff01ff01000000,
+    0xffff01ff0101ffff, 0xffff01ff0101ff01, 0xffff01ff010101ff, 0xffff01ff01010101,
+    0xffff0100ffff0000, 0xffff0100ff00ff00, 0xffff0100ff0000ff, 0xffff0100ff000100,
+    0xffff0100ff0100ff, 0xffff0100ff010000, 0xffff010000ffff00, 0xffff01000000ffff,
+    0xffff01000000ff00, 0xffff010000000000, 0xffff01000001ff00, 0xffff0100000100ff,
+    0xffff010000010100, 0xffff01000100ff00, 0xffff0100010000ff, 0xffff010001000001,
+    0xffff010001000100, 0xffff010001010000, 0xffff0101ffffffff, 0xffff0101ffffff01,
+    0xffff0101ffff01ff, 0xffff0101ffff0101, 0xffff0101ff000000, 0xffff0101ff01ffff,
+    0xffff0101ff01ff01, 0xffff0101ff0101ff, 0xffff0101ff010101, 0xffff010100ff0000,
+    0xffff01010000ff00, 0xffff010100000100, 0xffff01010001ff00, 0xffff010100010000,
+    0xffff010101ffffff, 0xffff010101ffff01, 0xffff010101ff0000, 0xffff010101ff01ff,
+    0xffff010101ff0101, 0xffff010101000000, 0xffff01010101ffff, 0xffff01010101ff01,
+    0xffff0101010101ff, 0xffff010101010101, 0xff00ffffff00ffff, 0xff00ffffff00ff00,
+    0xff00ffffff0000ff, 0xff00ffffff000100, 0xff00ffffff0100ff, 0xff00ffffff010000,
+    0xff00ffff00ffff00, 0xff00ffff00ff00ff, 0xff00ffff0000ffff, 0xff00ffff00000000,
+    0xff00ffff000001ff, 0xff00ffff0001ff00, 0xff00ffff000100ff, 0xff00ffff00010000,
+    0xff00ffff00010100, 0xff00ffff0100ff00, 0xff00ffff010000ff, 0xff00ffff01000001,
+    0xff00ffff0101ff00, 0xff00ffff01010000, 0xff00ff00ffffff00, 0xff00ff00ffff00ff,
+    0xff00ff00ffff0001, 0xff00ff00ffff0100, 0xff00ff00ff00ffff, 0xff00ff00ff00ff01,
+    0xff00ff00ff000000, 0xff00ff00ff0001ff, 0xff00ff00ff01ff00, 0xff00ff00ff0100ff,
+    0xff00ff00ff010100, 0xff00ff0000ff0000, 0xff00ff0000ff0101, 0xff00ff000000ffff,
+    0xff00ff000000ff00, 0xff00ff000000ff01, 0xff00ff00000000ff, 0xff00ff0000000000,
+    0xff00ff0000000001, 0xff00ff0000000100, 0xff00ff000001ffff, 0xff00ff0000010000,
+    0xff00ff0001ff00ff, 0xff00ff000100ff01, 0xff00ff0001000000, 0xff00ff000101ff00,
+    0xff00ff00010100ff, 0xff00ff01ff00ff00, 0xff00ff01ff0000ff, 0xff00ff01ff000001,
+    0xff00ff01ff010000, 0xff00ff0100ffffff, 0xff00ff0100ff0001, 0xff00ff0100ff0100,
+    0xff00ff010000ff01, 0xff00ff0100000000, 0xff00ff01000001ff, 0xff00ff0100000101,
+    0xff00ff01000100ff, 0xff00ff0100010001, 0xff00ff0101ff0000, 0xff00ff010100ff00,
+    0xff00ff01010000ff, 0xff00ff0101000001, 0xff00ff0101010000, 0xff0000ffffffff00,
+    0xff0000ffffff0001, 0xff0000ffffff0100, 0xff0000ffff0000ff, 0xff0000ffff000000,
+    0xff0000ffff0001ff, 0xff0000ffff000100, 0xff0000ffff01ff00, 0xff0000ffff010001,
+    0xff0000ff00ffff00, 0xff0000ff00ff0000, 0xff0000ff00ff0001, 0xff0000ff00ff01ff,
+    0xff0000ff00ff0101, 0xff0000ff0000ff00, 0xff0000ff000000ff, 0xff0000ff00000000,
+    0xff0000ff00000001, 0xff0000ff00000100, 0xff0000ff0001ff01, 0xff0000ff00010000,
+    0xff0000ff000101ff, 0xff0000ff01ff00ff, 0xff0000ff01ff0100, 0xff0000ff0100ffff,
+    0xff0000ff010000ff, 0xff0000ff01000000, 0xff0000ff010001ff, 0xff0000ff01000100,
+    0xff0000ff01000101, 0xff0000ff0101ff00, 0xff0000ff010100ff, 0xff0000ff01010000,
+    0xff0000ff01010100, 0xff000000ffffff01, 0xff000000ffff0000, 0xff000000ffff0101,
+    0xff000000ff00ff00, 0xff000000ff0000ff, 0xff000000ff000000, 0xff000000ff000001,
+    0xff000000ff000100, 0xff000000ff01ffff, 0xff000000ff01ff01, 0xff000000ff010000,
+    0xff000000ff0101ff, 0xff000000ff010101, 0xff00000000ffff00, 0xff00000000ff00ff,
+    0xff00000000ff0000, 0xff00000000ff0001, 0xff0000000000ff00, 0xff0000000000ff01,
+    0xff000000000000ff, 0xff00000000000000, 0xff00000000000001, 0xff00000000000100,
+    0xff00000000000101, 0xff0000000001ff00, 0xff000000000100ff, 0xff00000000010000,
+    0xff00000000010001, 0xff00000000010100, 0xff00000001ffffff, 0xff00000001ffff01,
+    0xff00000001ff00ff, 0xff00000001ff0000, 0xff00000001ff01ff, 0xff00000001ff0101,
+    0xff0000000100ffff, 0xff0000000100ff00, 0xff000000010000ff, 0xff00000001000000,
+    0xff00000001000001, 0xff00000001000100, 0xff00000001000101, 0xff0000000101ffff,
+    0xff0000000101ff01, 0xff00000001010000, 0xff000001ffffff00, 0xff000001ffff00ff,
+    0xff000001ffff0000, 0xff000001ffff0001, 0xff000001ff000000, 0xff000001ff000001,
+    0xff000001ff0001ff, 0xff000001ff000101, 0xff000001ff01ff00, 0xff000001ff010001,
+    0xff00000100ffffff, 0xff00000100ffff01, 0xff00000100ff00ff, 0xff00000100ff0000,
+    0xff00000100ff01ff, 0xff00000100ff0101, 0xff0000010000ff00, 0xff00000100000000,
+    0xff00000100000001, 0xff000001000001ff, 0xff00000100000100, 0xff0000010001ff00,
+    0xff000001000100ff, 0xff00000100010000, 0xff000001000101ff, 0xff00000100010100,
+    0xff00000100010101, 0xff00000101ff0001, 0xff00000101ff0101, 0xff0000010100ff01,
+    0xff00000101000000, 0xff000001010100ff, 0xff00000101010100, 0xff0001ffff00ff00,
+    0xff0001ffff000001, 0xff0001ffff010000, 0xff0001ff00ffff00, 0xff0001ff00ff00ff,
+    0xff0001ff00ff0001, 0xff0001ff00ff0100, 0xff0001ff0000ffff, 0xff0001ff00000000,
+    0xff0001ff000001ff, 0xff0001ff00000101, 0xff0001ff0001ffff, 0xff0001ff0001ff00,
+    0xff0001ff000100ff, 0xff0001ff00010001, 0xff0001ff00010100, 0xff0001ff01ff0000,
+    0xff0001ff0100ff00, 0xff0001ff010000ff, 0xff0001ff01010000, 0xff000100ff00ffff,
+    0xff000100ff00ff01, 0xff000100ff000000, 0xff000100ff000101, 0xff000100ff01ff00,
+    0xff000100ff010000, 0xff00010000ffff01, 0xff00010000ff00ff, 0xff00010000ff0000,
+    0xff00010000ff01ff, 0xff0001000000ff00, 0xff000100000000ff, 0xff00010000000000,
+    0xff00010000000001, 0xff00010000000100, 0xff00010000000101, 0xff0001000001ffff,
+    0xff00010000010000, 0xff00010000010101, 0xff00010001ff0100, 0xff0001000100ff00,
+    0xff0001000100ff01, 0xff00010001000000, 0xff000100010001ff, 0xff0001000101ff00,
+    0xff00010001010001, 0xff00010001010100, 0xff000101ffff0100, 0xff000101ff000001,
+    0xff000101ff0100ff, 0xff000101ff010001, 0xff00010100ff00ff, 0xff00010100ff0001,
+    0xff00010100ff0100, 0xff0001010000ffff, 0xff0001010000ff01, 0xff00010100000000,
+    0xff000101000001ff, 0xff0001010001ff00, 0xff00010100010001, 0xff00010100010100,
+    0xff00010101ff0000, 0xff0001010100ff00, 0xff00010101000001, 0xff00010101000101,
+    0xff01ffffffffffff, 0xff01ffffffffff01, 0xff01ffffffff01ff, 0xff01ffffffff0101,
+    0xff01ffffff000000, 0xff01ffffff01ffff, 0xff01ffffff01ff01, 0xff01ffffff010000,
+    0xff01ffffff0101ff, 0xff01ffffff010101, 0xff01ffff00ff0000, 0xff01ffff0000ff00,
+    0xff01ffff00000100, 0xff01ffff0001ff00, 0xff01ffff00010000, 0xff01ffff01ffffff,
+    0xff01ffff01ffff01, 0xff01ffff01ff01ff, 0xff01ffff01ff0101, 0xff01ffff01000000,
+    0xff01ffff0101ffff, 0xff01ffff0101ff01, 0xff01ffff01010000, 0xff01ffff010101ff,
+    0xff01ffff01010101, 0xff01ff00ffff0000, 0xff01ff00ff00ff00, 0xff01ff00ff0000ff,
+    0xff01ff00ff000100, 0xff01ff00ff010000, 0xff01ff0000ffff01, 0xff01ff0000ff00ff,
+    0xff01ff0000ff0100, 0xff01ff0000000000, 0xff01ff00000001ff, 0xff01ff0000000101,
+    0xff01ff000001ff00, 0xff01ff00000100ff, 0xff01ff0000010000, 0xff01ff0000010001,
+    0xff01ff0001ff0000, 0xff01ff000100ffff, 0xff01ff0001000001, 0xff01ff0001000100,
+    0xff01ff0001010000, 0xff01ff01ffffff00, 0xff01ff01ffff01ff, 0xff01ff01ffff0101,
+    0xff01ff01ff00ff00, 0xff01ff01ff000000, 0xff01ff01ff01ffff, 0xff01ff01ff01ff01,
+    0xff01ff01ff0101ff, 0xff01ff01ff010101, 0xff01ff0100ff0000, 0xff01ff010000ff00,
+    0xff01ff0100000001, 0xff01ff0100000100, 0xff01ff0100010000, 0xff01ff0101ffff00,
+    0xff01ff0101ff01ff, 0xff01ff0101ff0101, 0xff01ff010100ff00, 0xff01ff0101000000,
+    0xff01ff010101ffff, 0xff01ff010101ff01, 0xff01ff01010101ff, 0xff01ff0101010101,
+    0xff0100ffffff0000, 0xff0100ffff0000ff, 0xff0100ffff000001, 0xff0100ffff000100,
+    0xff0100ffff010000, 0xff0100ff00ff00ff, 0xff0100ff00ff0000, 0xff0100ff00ff0001,
+    0xff0100ff00ff0100, 0xff0100ff0000ff01, 0xff0100ff00000000, 0xff0100ff000001ff,
+    0xff0100ff00000101, 0xff0100ff00010001, 0xff0100ff01ff0000, 0xff0100ff0100ff00,
+    0xff0100ff010000ff, 0xff0100ff01000100, 0xff0100ff0101ff00, 0xff0100ff01010000,
+    0xff010000ffff0100, 0xff010000ff000000, 0xff010000ff01ff00, 0xff010000ff010100,
+    0xff01000000ffffff, 0xff01000000ff0000, 0xff01000000ff01ff, 0xff0100000000ff00,
+    0xff010000000000ff, 0xff01000000000000, 0xff01000000000100, 0xff0100000001ff01,
+    0xff01000000010000, 0xff010000000101ff, 0xff01000001ff0100, 0xff0100000100ffff,
+    0xff010000010000ff, 0xff01000001000000, 0xff010000010001ff, 0xff01000001000101,
+    0xff0100000101ff00, 0xff010000010100ff, 0xff01000001010001, 0xff01000001010100,
+    0xff010001ffff0000, 0xff010001ff00ffff, 0xff010001ff00ff01, 0xff010001ff000100,
+    0xff010001ff010000, 0xff01000100ffff00, 0xff01000100ff0100, 0xff01000100000000,
+    0xff0100010001ffff, 0xff0100010001ff00, 0xff01000100010100, 0xff01000101ff00ff,
+    0xff01000101ff0001, 0xff0100010100ffff, 0xff01000101000101, 0xff0101ffffffffff,
+    0xff0101ffffffff01, 0xff0101ffffff01ff, 0xff0101ffffff0101, 0xff0101ffff000000,
+    0xff0101ffff01ffff, 0xff0101ffff01ff01, 0xff0101ffff0101ff, 0xff0101ffff010101,
+    0xff0101ff00ff0000, 0xff0101ff0000ff00, 0xff0101ff000000ff, 0xff0101ff00010000,
+    0xff0101ff01ffffff, 0xff0101ff01ffff01, 0xff0101ff01ff01ff, 0xff0101ff01ff0101,
+    0xff0101ff0101ffff, 0xff0101ff0101ff01, 0xff0101ff010101ff, 0xff0101ff01010101,
+    0xff010100ffff0100, 0xff010100ff00ff00, 0xff010100ff0000ff, 0xff010100ff000100,
+    0xff010100ff010000, 0xff01010000ff0001, 0xff01010000ff0100, 0xff0101000000ff01,
+    0xff01010000000000, 0xff0101000001ff00, 0xff010100000100ff, 0xff01010000010001,
+    0xff01010000010100, 0xff01010001ff0000, 0xff0101000100ffff, 0xff01010001000001,
+    0xff01010001000100, 0xff010100010100ff, 0xff01010001010000, 0xff010101ffffffff,
+    0xff010101ffffff01, 0xff010101ffff01ff, 0xff010101ffff0101, 0xff010101ff01ffff,
+    0xff010101ff01ff01, 0xff010101ff0101ff, 0xff010101ff010101, 0xff01010100ff0000,
+    0xff0101010000ff00, 0xff01010100000001, 0xff01010100000100, 0xff01010100010000,
+    0xff01010101ffffff, 0xff01010101ffff01, 0xff01010101ff01ff, 0xff01010101ff0101,
+    0xff01010101000000, 0xff0101010101ffff, 0xff0101010101ff01, 0xff010101010101ff,
+    0xff01010101010101, 0x00ffffffffff0000, 0x00ffffffff00ff00, 0x00ffffffff000001,
+    0x00ffffffff010000, 0x00ffffff00ff0100, 0x00ffffff0000ff01, 0x00ffffff00000000,
+    0x00ffffff000001ff, 0x00ffffff00000101, 0x00ffffff0001ff00, 0x00ffffff000100ff,
+    0x00ffffff00010001, 0x00ffffff010000ff, 0x00ffffff01000100, 0x00ffffff0101ff00,
+    0x00ffffff01010001, 0x00ffff00ffffffff, 0x00ffff00ffffff00, 0x00ffff00ffff00ff,
+    0x00ffff00ffff0001, 0x00ffff00ffff0100, 0x00ffff00ff00ff01, 0x00ffff00ff000000,
+    0x00ffff00ff000001, 0x00ffff00ff0001ff, 0x00ffff00ff000101, 0x00ffff00ff01ff00,
+    0x00ffff00ff010001, 0x00ffff00ff010100, 0x00ffff0000ff0000, 0x00ffff0000ff01ff,
+    0x00ffff0000ff0101, 0x00ffff000000ff00, 0x00ffff00000000ff, 0x00ffff0000000000,
+    0x00ffff0000000001, 0x00ffff0000000100, 0x00ffff0000000101, 0x00ffff0000010000,
+    0x00ffff00000101ff, 0x00ffff0000010101, 0x00ffff0001ffff00, 0x00ffff0001ff00ff,
+    0x00ffff0001ff0001, 0x00ffff000100ffff, 0x00ffff000100ff01, 0x00ffff0001000000,
+    0x00ffff000101ffff, 0x00ffff000101ff00, 0x00ffff000101ff01, 0x00ffff01ffff0000,
+    0x00ffff01ff00ff00, 0x00ffff01ff0000ff, 0x00ffff01ff000001, 0x00ffff01ff010000,
+    0x00ffff0100ffff00, 0x00ffff010000ff01, 0x00ffff0100000000, 0x00ffff0100000101,
+    0x00ffff01000100ff, 0x00ffff0100010100, 0x00ffff0101ff0100, 0x00ffff01010000ff,
+    0x00ffff0101010000, 0x00ff00ffffffff00, 0x00ff00ffff000000, 0x00ff00ffff000100,
+    0x00ff00ffff010100, 0x00ff00ff00ff0000, 0x00ff00ff00ff01ff, 0x00ff00ff00ff0101,
+    0x00ff00ff0000ff00, 0x00ff00ff000000ff, 0x00ff00ff00000000, 0x00ff00ff00000001,
+    0x00ff00ff0001ff00, 0x00ff00ff0001ff01, 0x00ff00ff00010000, 0x00ff00ff000101ff,
+    0x00ff00ff00010101, 0x00ff00ff01ffff00, 0x00ff00ff01ff0001, 0x00ff00ff01ff0100,
+    0x00ff00ff0100ffff, 0x00ff00ff0100ff01, 0x00ff00ff01000000, 0x00ff00ff0101ffff,
+    0x00ff00ff0101ff00, 0x00ff00ff01010100, 0x00ff0000ffffff00, 0x00ff0000ffffff01,
+    0x00ff0000ffff0000, 0x00ff0000ffff0101, 0x00ff0000ff00ff00, 0x00ff0000ff0000ff,
+    0x00ff0000ff000000, 0x00ff0000ff000001, 0x00ff0000ff000100, 0x00ff0000ff01ffff,
+    0x00ff0000ff010000, 0x00ff0000ff010101, 0x00ff000000ffff00, 0x00ff000000ff00ff,
+    0x00ff000000ff0000, 0x00ff000000ff0001, 0x00ff000000ff0100, 0x00ff00000000ffff,
+    0x00ff00000000ff00, 0x00ff0000000000ff, 0x00ff000000000000, 0x00ff000000000001,
+    0x00ff0000000001ff, 0x00ff000000000100, 0x00ff00000001ff00, 0x00ff0000000100ff,
+    0x00ff000000010000, 0x00ff000000010001, 0x00ff000000010100, 0x00ff000001ffff01,
+    0x00ff000001ff00ff, 0x00ff000001ff0000, 0x00ff000001ff01ff, 0x00ff00000100ff00,
+    0x00ff0000010000ff, 0x00ff000001000000, 0x00ff000001000001, 0x00ff000001000100,
+    0x00ff000001000101, 0x00ff000001010000, 0x00ff0000010101ff, 0x00ff000001010101,
+    0x00ff0001ffffff00, 0x00ff0001ffff0000, 0x00ff0001ffff0100, 0x00ff0001ff0000ff,
+    0x00ff0001ff000000, 0x00ff0001ff0001ff, 0x00ff0001ff000101, 0x00ff0001ff01ff00,
+    0x00ff0001ff0100ff, 0x00ff0001ff010100, 0x00ff000100ffffff, 0x00ff000100ffff01,
+    0x00ff000100ff0000, 0x00ff000100ff01ff, 0x00ff00010000ffff, 0x00ff00010000ff00,
+    0x00ff00010000ff01, 0x00ff000100000000, 0x00ff000100000001, 0x00ff000100000100,
+    0x00ff00010001ff01, 0x00ff000100010000, 0x00ff0001000101ff, 0x00ff000101ffff00,
+    0x00ff000101ff0000, 0x00ff000101ff0101, 0x00ff0001010000ff, 0x00ff000101000000,
+    0x00ff00010101ff00, 0x00ff0001010100ff, 0x00ff000101010001, 0x00ff01ffffff0000,
+    0x00ff01ffff00ff00, 0x00ff01ffff000000, 0x00ff01ffff000101, 0x00ff01ffff010000,
+    0x00ff01ff00ffff01, 0x00ff01ff00ff0100, 0x00ff01ff0000ffff, 0x00ff01ff00000000,
+    0x00ff01ff000001ff, 0x00ff01ff0001ff00, 0x00ff01ff000100ff, 0x00ff01ff00010001,
+    0x00ff01ff00010100, 0x00ff01ff01ff0000, 0x00ff01ff0100ff00, 0x00ff01ff010000ff,
+    0x00ff01ff01000001, 0x00ff01ff01000100, 0x00ff01ff01010000, 0x00ff0100ffffff00,
+    0x00ff0100ffff0000, 0x00ff0100ffff0001, 0x00ff0100ffff0101, 0x00ff0100ff00ffff,
+    0x00ff0100ff0000ff, 0x00ff0100ff000000, 0x00ff0100ff0001ff, 0x00ff0100ff01ff00,
+    0x00ff0100ff0100ff, 0x00ff0100ff010001, 0x00ff010000ffffff, 0x00ff010000ff0000,
+    0x00ff010000ff0101, 0x00ff01000000ff00, 0x00ff01000000ff01, 0x00ff0100000000ff,
+    0x00ff010000000000, 0x00ff010000000001, 0x00ff010000000100, 0x00ff01000001ffff,
+    0x00ff01000001ff01, 0x00ff010000010000, 0x00ff010000010001, 0x00ff010000010101,
+    0x00ff010001ff0001, 0x00ff010001ff0100, 0x00ff01000100ff01, 0x00ff010001000000,
+    0x00ff010001000001, 0x00ff0100010001ff, 0x00ff01000101ff00, 0x00ff0100010100ff,
+    0x00ff010001010001, 0x00ff010001010100, 0x00ff0101ff000001, 0x00ff010100ff00ff,
+    0x00ff010100ff0001, 0x00ff010100ff0100, 0x00ff010100000000, 0x00ff0101000001ff,
+    0x00ff010100000101, 0x00ff0101000100ff, 0x00ff010100010100, 0x00ff0101010000ff,
+    0x00ff010101010000, 0x0000ffffffffff00, 0x0000ffffffff00ff, 0x0000ffffffff0000,
+    0x0000ffffffff0001, 0x0000ffffffff0100, 0x0000ffffff00ff01, 0x0000ffffff000000,
+    0x0000ffffff000101, 0x0000ffffff01ff00, 0x0000ffffff0100ff, 0x0000ffffff010100,
+    0x0000ffff00ffffff, 0x0000ffff00ff0000, 0x0000ffff00ff01ff, 0x0000ffff0000ff00,
+    0x0000ffff000000ff, 0x0000ffff00000000, 0x0000ffff00000001, 0x0000ffff00000100,
+    0x0000ffff00010000, 0x0000ffff000101ff, 0x0000ffff01ff0001, 0x0000ffff01ff0100,
+    0x0000ffff01000000, 0x0000ffff010001ff, 0x0000ffff0101ffff, 0x0000ffff0101ff00,
+    0x0000ffff01010001, 0x0000ffff01010100, 0x0000ff00ffff0000, 0x0000ff00ffff01ff,
+    0x0000ff00ffff0100, 0x0000ff00ffff0101, 0x0000ff00ff00ff00, 0x0000ff00ff0000ff,
+    0x0000ff00ff000000, 0x0000ff00ff000001, 0x0000ff00ff0001ff, 0x0000ff00ff000100,
+    0x0000ff00ff01ffff, 0x0000ff00ff010000, 0x0000ff00ff010001, 0x0000ff00ff0101ff,
+    0x0000ff00ff010101, 0x0000ff0000ffff00, 0x0000ff0000ff00ff, 0x0000ff0000ff0000,
+    0x0000ff0000ff0001, 0x0000ff0000ff0100, 0x0000ff000000ffff, 0x0000ff000000ff00,
+    0x0000ff000000ff01, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
+    0x0000ff00000001ff, 0x0000ff0000000100, 0x0000ff0000000101, 0x0000ff000001ff00,
+    0x0000ff00000100ff, 0x0000ff0000010000, 0x0000ff0000010001, 0x0000ff0000010100,
+    0x0000ff0001ffff01, 0x0000ff0001ff0000, 0x0000ff000100ff00, 0x0000ff00010000ff,
+    0x0000ff0001000000, 0x0000ff0001000001, 0x0000ff0001000100, 0x0000ff000101ffff,
+    0x0000ff0001010000, 0x0000ff0001010101, 0x0000ff01ffffff00, 0x0000ff01ffff0001,
+    0x0000ff01ff00ff01, 0x0000ff01ff000000, 0x0000ff01ff000101, 0x0000ff01ff01ff00,
+    0x0000ff01ff0100ff, 0x0000ff0100ffff01, 0x0000ff0100ff0000, 0x0000ff0100ff0101,
+    0x0000ff010000ff00, 0x0000ff01000000ff, 0x0000ff0100000000, 0x0000ff0100000001,
+    0x0000ff0100000100, 0x0000ff010001ff01, 0x0000ff0100010000, 0x0000ff0101ff0000,
+    0x0000ff010100ffff, 0x0000ff010100ff01, 0x0000ff0101000000, 0x0000ff0101000100,
+    0x0000ff0101000101, 0x0000ff01010100ff, 0x000000ffffff00ff, 0x000000ffffff0000,
+    0x000000ffff00ff00, 0x000000ffff0000ff, 0x000000ffff000000, 0x000000ffff000001,
+    0x000000ffff0001ff, 0x000000ffff000100, 0x000000ffff01ff00, 0x000000ffff010000,
+    0x000000ffff0101ff, 0x000000ffff010101, 0x000000ff00ffff00, 0x000000ff00ff00ff,
+    0x000000ff00ff0000, 0x000000ff00ff0001, 0x000000ff00ff0100, 0x000000ff00ff0101,
+    0x000000ff0000ffff, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
+    0x000000ff00000001, 0x000000ff000001ff, 0x000000ff00000100, 0x000000ff00000101,
+    0x000000ff0001ff00, 0x000000ff0001ff01, 0x000000ff000100ff, 0x000000ff00010000,
+    0x000000ff00010001, 0x000000ff00010100, 0x000000ff01ffffff, 0x000000ff01ff01ff,
+    0x000000ff01ff0101, 0x000000ff0100ff00, 0x000000ff010000ff, 0x000000ff01000000,
+    0x000000ff01000001, 0x000000ff01000100, 0x000000ff0101ff00, 0x000000ff010100ff,
+    0x000000ff01010000, 0x000000ff01010101, 0x00000000ffffff00, 0x00000000ffffff01,
+    0x00000000ffff00ff, 0x00000000ffff0000, 0x00000000ffff0001, 0x00000000ffff0100,
+    0x00000000ff00ffff, 0x00000000ff00ff00, 0x00000000ff00ff01, 0x00000000ff0000ff,
+    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff000101,
+    0x00000000ff01ff00, 0x00000000ff0100ff, 0x00000000ff010000, 0x00000000ff010001,
+    0x00000000ff010100, 0x0000000000ffffff, 0x0000000000ffff00, 0x0000000000ffff01,
+    0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, 0x0000000000ff01ff,
+    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
+    0x00000000000000ff, 0x0000000000000000, 0x0000000000000001, 0x00000000000001ff,
+    0x0000000000000100, 0x0000000000000101, 0x000000000001ffff, 0x000000000001ff00,
+    0x00000000000100ff, 0x0000000000010000, 0x0000000000010001, 0x00000000000101ff,
+    0x0000000000010100, 0x0000000000010101, 0x0000000001ffff00, 0x0000000001ff00ff,
+    0x0000000001ff0000, 0x0000000001ff0100, 0x0000000001ff0101, 0x000000000100ffff,
+    0x000000000100ff00, 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001,
+    0x00000000010001ff, 0x0000000001000100, 0x000000000101ff00, 0x00000000010100ff,
+    0x0000000001010000, 0x0000000001010001, 0x0000000001010100, 0x00000001ffffffff,
+    0x00000001ffffff00, 0x00000001ffffff01, 0x00000001ffff00ff, 0x00000001ffff0001,
+    0x00000001ffff01ff, 0x00000001ffff0100, 0x00000001ff00ff00, 0x00000001ff0000ff,
+    0x00000001ff000000, 0x00000001ff0001ff, 0x00000001ff000100, 0x00000001ff01ffff,
+    0x00000001ff01ff00, 0x00000001ff01ff01, 0x00000001ff0100ff, 0x00000001ff010000,
+    0x00000001ff010001, 0x00000001ff0101ff, 0x00000001ff010100, 0x0000000100ffff00,
+    0x0000000100ff0000, 0x0000000100ff0001, 0x0000000100ff01ff, 0x0000000100ff0100,
+    0x0000000100ff0101, 0x000000010000ffff, 0x000000010000ff00, 0x000000010000ff01,
+    0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, 0x00000001000001ff,
+    0x0000000100000100, 0x0000000100000101, 0x000000010001ff00, 0x00000001000100ff,
+    0x0000000100010000, 0x0000000100010100, 0x0000000101ffff01, 0x0000000101ff0000,
+    0x0000000101ff0001, 0x0000000101ff01ff, 0x0000000101ff0100, 0x0000000101ff0101,
+    0x000000010100ff00, 0x0000000101000000, 0x0000000101000101, 0x000000010101ff01,
+    0x0000000101010000, 0x0000000101010001, 0x00000001010101ff, 0x0000000101010100,
+    0x000001ffffff00ff, 0x000001ffffff0000, 0x000001ffffff0001, 0x000001ffffff0100,
+    0x000001ffff00ffff, 0x000001ffff000000, 0x000001ffff0001ff, 0x000001ffff01ff00,
+    0x000001ffff010101, 0x000001ff00ff0000, 0x000001ff00ff01ff, 0x000001ff00ff0101,
+    0x000001ff0000ff00, 0x000001ff000000ff, 0x000001ff00000000, 0x000001ff00000001,
+    0x000001ff000001ff, 0x000001ff00000100, 0x000001ff0001ffff, 0x000001ff0001ff01,
+    0x000001ff000100ff, 0x000001ff00010000, 0x000001ff01ffff01, 0x000001ff01ff0100,
+    0x000001ff0100ffff, 0x000001ff0100ff01, 0x000001ff01000000, 0x000001ff010001ff,
+    0x000001ff0101ff00, 0x000001ff01010100, 0x00000100ffffff00, 0x00000100ffffff01,
+    0x00000100ffff0000, 0x00000100ffff0101, 0x00000100ff00ff00, 0x00000100ff0000ff,
+    0x00000100ff000000, 0x00000100ff000001, 0x00000100ff000100, 0x00000100ff010000,
+    0x0000010000ffff00, 0x0000010000ff00ff, 0x0000010000ff0000, 0x0000010000ff0001,
+    0x0000010000ff0100, 0x000001000000ffff, 0x000001000000ff00, 0x000001000000ff01,
+    0x00000100000000ff, 0x0000010000000000, 0x0000010000000001, 0x00000100000001ff,
+    0x0000010000000100, 0x0000010000000101, 0x000001000001ff00, 0x00000100000100ff,
+    0x0000010000010000, 0x0000010000010001, 0x0000010000010100, 0x0000010001ffff00,
+    0x0000010001ff0000, 0x0000010001ff0100, 0x000001000100ff00, 0x00000100010000ff,
+    0x0000010001000000, 0x0000010001000001, 0x00000100010001ff, 0x0000010001000100,
+    0x0000010001010000, 0x00000101ffff00ff, 0x00000101ffff01ff, 0x00000101ff000000,
+    0x00000101ff000101, 0x00000101ff01ffff, 0x00000101ff010000, 0x00000101ff010001,
+    0x00000101ff010100, 0x0000010100ff0000, 0x0000010100ff01ff, 0x0000010100ff0100,
+    0x000001010000ff00, 0x0000010100000000, 0x0000010100000001, 0x00000101000001ff,
+    0x0000010100000100, 0x000001010001ff01, 0x0000010100010000, 0x00000101000101ff,
+    0x0000010100010101, 0x0000010101ffff00, 0x0000010101ff0101, 0x000001010100ff01,
+    0x0000010101000000, 0x0000010101000001, 0x00000101010001ff, 0x0000010101000101,
+    0x000001010101ff00, 0x0001ffffffff0000, 0x0001ffffff0000ff, 0x0001ffffff000001,
+    0x0001ffffff000100, 0x0001ffffff010000, 0x0001ffff00ff00ff, 0x0001ffff0000ffff,
+    0x0001ffff00000000, 0x0001ffff00000001, 0x0001ffff000001ff, 0x0001ffff00000101,
+    0x0001ffff0001ff00, 0x0001ffff000100ff, 0x0001ffff00010001, 0x0001ffff00010100,
+    0x0001ffff01ffff00, 0x0001ffff01000001, 0x0001ffff01010000, 0x0001ff00ffffff00,
+    0x0001ff00ffff00ff, 0x0001ff00ffff0001, 0x0001ff00ffff0100, 0x0001ff00ff00ff01,
+    0x0001ff00ff000000, 0x0001ff00ff01ff00, 0x0001ff00ff01ff01, 0x0001ff00ff010001,
+    0x0001ff00ff010100, 0x0001ff0000ff0000, 0x0001ff0000ff0100, 0x0001ff000000ff00,
+    0x0001ff0000000000, 0x0001ff0000000001, 0x0001ff0000000100, 0x0001ff0000010000,
+    0x0001ff0000010001, 0x0001ff0000010101, 0x0001ff0001ff00ff, 0x0001ff0001ff0101,
+    0x0001ff000100ff01, 0x0001ff0001000000, 0x0001ff000101ff00, 0x0001ff0001010001,
+    0x0001ff0001010100, 0x0001ff01ff00ff00, 0x0001ff01ff000001, 0x0001ff01ff000100,
+    0x0001ff0100ffffff, 0x0001ff0100ffff00, 0x0001ff0100ff0001, 0x0001ff0100000000,
+    0x0001ff0100000001, 0x0001ff01000001ff, 0x0001ff010001ffff, 0x0001ff0101ff0000,
+    0x0001ff010100ff00, 0x0001ff0101000001, 0x0001ff0101010000, 0x000100ffff00ff00,
+    0x000100ffff00ff01, 0x000100ffff000000, 0x000100ffff000001, 0x000100ffff000101,
+    0x000100ffff01ff00, 0x000100ffff010001, 0x000100ffff010100, 0x000100ff00ffffff,
+    0x000100ff00ffff01, 0x000100ff00ff0000, 0x000100ff00ff01ff, 0x000100ff00ff0101,
+    0x000100ff0000ff00, 0x000100ff000000ff, 0x000100ff00000000, 0x000100ff00000001,
+    0x000100ff00000100, 0x000100ff00000101, 0x000100ff0001ffff, 0x000100ff0001ff01,
+    0x000100ff00010000, 0x000100ff01ff00ff, 0x000100ff01ff0000, 0x000100ff01ff0100,
+    0x000100ff0100ffff, 0x000100ff0100ff01, 0x000100ff010000ff, 0x000100ff01000000,
+    0x000100ff01000001, 0x000100ff010001ff, 0x000100ff01000101, 0x000100ff0101ff00,
+    0x000100ff010100ff, 0x000100ff01010100, 0x00010000ffff0000, 0x00010000ffff01ff,
+    0x00010000ffff0101, 0x00010000ff00ff00, 0x00010000ff000000, 0x00010000ff000001,
+    0x00010000ff000100, 0x0001000000ff00ff, 0x0001000000ff0000, 0x0001000000ff0001,
+    0x0001000000ff0100, 0x000100000000ffff, 0x000100000000ff00, 0x00010000000000ff,
+    0x0001000000000000, 0x0001000000000001, 0x0001000000000100, 0x000100000001ff00,
+    0x00010000000100ff, 0x0001000000010000, 0x0001000000010001, 0x0001000000010100,
+    0x0001000001ff0001, 0x0001000001ff0100, 0x0001000001ff0101, 0x000100000100ff00,
+    0x0001000001000000, 0x0001000001000001, 0x0001000001000100, 0x0001000001000101,
+    0x000100000101ff01, 0x0001000001010000, 0x0001000001010001, 0x00010000010101ff,
+    0x00010001ffffff01, 0x00010001ffff0100, 0x00010001ff000000, 0x00010001ff01ffff,
+    0x00010001ff010001, 0x00010001ff0101ff, 0x00010001ff010100, 0x0001000100ffffff,
+    0x0001000100ff0000, 0x0001000100ff01ff, 0x0001000100ff0101, 0x000100010000ff00,
+    0x00010001000000ff, 0x0001000100000000, 0x0001000100000001, 0x00010001000001ff,
+    0x0001000100000101, 0x000100010001ffff, 0x0001000100010000, 0x00010001000101ff,
+    0x0001000101ffffff, 0x0001000101ffff01, 0x0001000101ff0000, 0x0001000101ff0101,
+    0x00010001010000ff, 0x0001000101000001, 0x00010001010001ff, 0x0001000101000100,
+    0x000100010101ffff, 0x00010001010100ff, 0x0001000101010001, 0x0001000101010101,
+    0x000101ffff000001, 0x000101ffff000100, 0x000101ffff010000, 0x000101ff00ffff00,
+    0x000101ff0000ff01, 0x000101ff00000000, 0x000101ff00000101, 0x000101ff0001ff00,
+    0x000101ff00010100, 0x000101ff01ff0000, 0x000101ff0100ff00, 0x000101ff010001ff,
+    0x000101ff01010001, 0x00010100ffffff00, 0x00010100ffff00ff, 0x00010100ff00ffff,
+    0x00010100ff000000, 0x00010100ff01ff00, 0x00010100ff0100ff, 0x00010100ff010001,
+    0x00010100ff010100, 0x0001010000ffffff, 0x0001010000ffff00, 0x0001010000ff0000,
+    0x0001010000ff0001, 0x0001010000ff01ff, 0x000101000000ff00, 0x00010100000000ff,
+    0x0001010000000000, 0x0001010000000001, 0x0001010000000100, 0x000101000001ffff,
+    0x0001010000010000, 0x0001010000010101, 0x0001010001ffff01, 0x0001010001ff00ff,
+    0x0001010001ff0101, 0x0001010001000000, 0x000101000101ff00, 0x00010100010100ff,
+    0x0001010001010000, 0x0001010001010100, 0x00010101ff00ff00, 0x00010101ff000001,
+    0x00010101ff0001ff, 0x0001010100ffff00, 0x0001010100ff00ff, 0x0001010100ff0100,
+    0x000101010000ffff, 0x0001010100000000, 0x00010101000001ff, 0x0001010100000101,
+    0x00010101000100ff, 0x0001010100010000, 0x0001010100010100, 0x0001010101ff0001,
+    0x00010101010000ff, 0x00010101010001ff, 0x0001010101000101, 0x0001010101010001,
+    0x01ffffffffffffff, 0x01ffffffffffff01, 0x01ffffffffff01ff, 0x01ffffffffff0101,
+    0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, 0x01ffffffff010101,
+    0x01ffffff00ff0000, 0x01ffffff0000ffff, 0x01ffffff0000ff00, 0x01ffffff000000ff,
+    0x01ffffff00000001, 0x01ffffff00000100, 0x01ffffff00010000, 0x01ffffff01ffffff,
+    0x01ffffff01ffff01, 0x01ffffff01ff01ff, 0x01ffffff01ff0101, 0x01ffffff01000000,
+    0x01ffffff0101ffff, 0x01ffffff0101ff01, 0x01ffffff010101ff, 0x01ffffff01010101,
+    0x01ffff00ffff0000, 0x01ffff00ff00ff00, 0x01ffff00ff0000ff, 0x01ffff00ff000001,
+    0x01ffff00ff000100, 0x01ffff00ff010000, 0x01ffff0000ffff00, 0x01ffff0000ff00ff,
+    0x01ffff0000ff0100, 0x01ffff000000ffff, 0x01ffff000000ff01, 0x01ffff0000000000,
+    0x01ffff0000000001, 0x01ffff00000001ff, 0x01ffff0000000100, 0x01ffff00000100ff,
+    0x01ffff0000010001, 0x01ffff0000010100, 0x01ffff0001ff0000, 0x01ffff0001ff0100,
+    0x01ffff00010000ff, 0x01ffff0001000001, 0x01ffff0001000100, 0x01ffff0001010000,
+    0x01ffff01ffffffff, 0x01ffff01ffffff01, 0x01ffff01ffff01ff, 0x01ffff01ffff0101,
+    0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff01ff01, 0x01ffff01ff0101ff,
+    0x01ffff01ff010101, 0x01ffff010000ff00, 0x01ffff01000000ff, 0x01ffff0100000100,
+    0x01ffff0100010000, 0x01ffff0101ffffff, 0x01ffff0101ffff01, 0x01ffff0101ff01ff,
+    0x01ffff0101ff0101, 0x01ffff0101000000, 0x01ffff010101ffff, 0x01ffff010101ff01,
+    0x01ffff01010101ff, 0x01ffff0101010101, 0x01ff00ffff0000ff, 0x01ff00ffff000100,
+    0x01ff00ff00ffff00, 0x01ff00ff00ff00ff, 0x01ff00ff0000ff00, 0x01ff00ff00000000,
+    0x01ff00ff00000101, 0x01ff00ff0001ff00, 0x01ff00ff000100ff, 0x01ff00ff00010100,
+    0x01ff00ff010000ff, 0x01ff00ff01000100, 0x01ff0000ffffff00, 0x01ff0000ffff0100,
+    0x01ff0000ff00ff01, 0x01ff0000ff000000, 0x01ff0000ff000101, 0x01ff0000ff010001,
+    0x01ff0000ff010100, 0x01ff000000ffffff, 0x01ff000000ffff00, 0x01ff000000ff0000,
+    0x01ff000000ff01ff, 0x01ff00000000ff00, 0x01ff0000000000ff, 0x01ff000000000000,
+    0x01ff000000000001, 0x01ff000000000100, 0x01ff000000000101, 0x01ff000000010000,
+    0x01ff000000010001, 0x01ff0000000101ff, 0x01ff000000010101, 0x01ff000001ffff00,
+    0x01ff000001ff00ff, 0x01ff000001ff0001, 0x01ff000001ff0100, 0x01ff00000100ffff,
+    0x01ff00000100ff01, 0x01ff000001000000, 0x01ff0000010001ff, 0x01ff000001010001,
+    0x01ff0001ff00ff00, 0x01ff0001ff000001, 0x01ff0001ff000100, 0x01ff0001ff010000,
+    0x01ff000100ffff00, 0x01ff000100ff00ff, 0x01ff000100ff0100, 0x01ff000100ff0101,
+    0x01ff00010000ffff, 0x01ff000100000000, 0x01ff000100000100, 0x01ff000100000101,
+    0x01ff00010001ff00, 0x01ff000100010001, 0x01ff000100010101, 0x01ff000101ff0000,
+    0x01ff00010100ff00, 0x01ff000101000101, 0x01ff0001010100ff, 0x01ff01ffffffffff,
+    0x01ff01ffffffff01, 0x01ff01ffffff01ff, 0x01ff01ffffff0101, 0x01ff01ffff000000,
+    0x01ff01ffff01ffff, 0x01ff01ffff01ff01, 0x01ff01ffff0101ff, 0x01ff01ffff010101,
+    0x01ff01ff00ffff00, 0x01ff01ff00ff0000, 0x01ff01ff0000ff00, 0x01ff01ff000000ff,
+    0x01ff01ff00000100, 0x01ff01ff00010000, 0x01ff01ff00010100, 0x01ff01ff01ffffff,
+    0x01ff01ff01ffff01, 0x01ff01ff01ff01ff, 0x01ff01ff01ff0101, 0x01ff01ff01000000,
+    0x01ff01ff0101ffff, 0x01ff01ff0101ff01, 0x01ff01ff010101ff, 0x01ff01ff01010101,
+    0x01ff0100ffff0000, 0x01ff0100ffff0001, 0x01ff0100ff00ff00, 0x01ff0100ff0000ff,
+    0x01ff0100ff000001, 0x01ff0100ff010000, 0x01ff010000ffff00, 0x01ff010000ff00ff,
+    0x01ff010000ff0001, 0x01ff010000ff0100, 0x01ff01000000ffff, 0x01ff01000000ff01,
+    0x01ff010000000000, 0x01ff010000000101, 0x01ff01000001ff00, 0x01ff0100000100ff,
+    0x01ff010001ff0000, 0x01ff010001000001, 0x01ff010001000100, 0x01ff010001010000,
+    0x01ff0101ffffffff, 0x01ff0101ffffff01, 0x01ff0101ffff01ff, 0x01ff0101ffff0101,
+    0x01ff0101ff000000, 0x01ff0101ff01ffff, 0x01ff0101ff01ff01, 0x01ff0101ff0101ff,
+    0x01ff0101ff010101, 0x01ff010100ff0000, 0x01ff01010000ff00, 0x01ff0101000000ff,
+    0x01ff010100000001, 0x01ff010101ffffff, 0x01ff010101ffff01, 0x01ff010101ff01ff,
+    0x01ff010101ff0101, 0x01ff010101000000, 0x01ff01010101ffff, 0x01ff01010101ff01,
+    0x01ff0101010101ff, 0x01ff010101010101, 0x0100ffffffff0000, 0x0100ffffff00ff00,
+    0x0100ffffff000001, 0x0100ffffff0001ff, 0x0100ffffff000100, 0x0100ffffff010000,
+    0x0100ffff00ffff00, 0x0100ffff00ff0001, 0x0100ffff00ff0100, 0x0100ffff00000000,
+    0x0100ffff000001ff, 0x0100ffff00000101, 0x0100ffff00010100, 0x0100ffff00010101,
+    0x0100ffff01ff0000, 0x0100ffff0100ff00, 0x0100ffff010000ff, 0x0100ffff01000001,
+    0x0100ffff01000100, 0x0100ffff01010000, 0x0100ff00ffffff00, 0x0100ff00ffff00ff,
+    0x0100ff00ffff0001, 0x0100ff00ffff0100, 0x0100ff00ff00ffff, 0x0100ff00ff000000,
+    0x0100ff00ff0001ff, 0x0100ff00ff000101, 0x0100ff00ff01ff00, 0x0100ff00ff0100ff,
+    0x0100ff00ff010001, 0x0100ff00ff010100, 0x0100ff0000ffffff, 0x0100ff0000ff0000,
+    0x0100ff000000ffff, 0x0100ff000000ff00, 0x0100ff00000000ff, 0x0100ff0000000000,
+    0x0100ff0000000001, 0x0100ff0000000100, 0x0100ff000001ff01, 0x0100ff0000010000,
+    0x0100ff0001ff00ff, 0x0100ff0001ff0001, 0x0100ff000100ff01, 0x0100ff0001000000,
+    0x0100ff00010001ff, 0x0100ff000101ff00, 0x0100ff00010100ff, 0x0100ff0001010001,
+    0x0100ff0001010100, 0x0100ff01ffff0000, 0x0100ff01ff00ff00, 0x0100ff01ff0000ff,
+    0x0100ff01ff000100, 0x0100ff01ff010000, 0x0100ff0100ff00ff, 0x0100ff0100ff0001,
+    0x0100ff0100ff0100, 0x0100ff010000ffff, 0x0100ff010000ff01, 0x0100ff0100000000,
+    0x0100ff01000001ff, 0x0100ff0100010001, 0x0100ff0100010100, 0x0100ff0101ff0000,
+    0x0100ff01010000ff, 0x0100ff0101000001, 0x0100ff0101010100, 0x010000ffffffff00,
+    0x010000ffffff00ff, 0x010000ffffff0001, 0x010000ffff00ffff, 0x010000ffff000000,
+    0x010000ffff0001ff, 0x010000ffff010001, 0x010000ff00ffffff, 0x010000ff00ff0101,
+    0x010000ff0000ff00, 0x010000ff000000ff, 0x010000ff00000000, 0x010000ff00000001,
+    0x010000ff000001ff, 0x010000ff00000100, 0x010000ff0001ffff, 0x010000ff0001ff00,
+    0x010000ff0001ff01, 0x010000ff00010000, 0x010000ff01ff00ff, 0x010000ff01ff0001,
+    0x010000ff0100ff01, 0x010000ff010000ff, 0x010000ff01000000, 0x010000ff010001ff,
+    0x010000ff0101ff00, 0x010000ff01010100, 0x01000000ffffffff, 0x01000000ffff0000,
+    0x01000000ffff01ff, 0x01000000ffff0101, 0x01000000ff00ffff, 0x01000000ff00ff00,
+    0x01000000ff0000ff, 0x01000000ff000000, 0x01000000ff000001, 0x01000000ff000100,
+    0x01000000ff01ff00, 0x01000000ff010000, 0x01000000ff010100, 0x01000000ff010101,
+    0x0100000000ffff00, 0x0100000000ff00ff, 0x0100000000ff0000, 0x0100000000ff0001,
+    0x0100000000ff0100, 0x010000000000ffff, 0x010000000000ff00, 0x010000000000ff01,
+    0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, 0x01000000000001ff,
+    0x0100000000000100, 0x0100000000000101, 0x010000000001ff00, 0x01000000000100ff,
+    0x0100000000010000, 0x0100000000010001, 0x0100000000010100, 0x0100000001ffff00,
+    0x0100000001ff0000, 0x0100000001ff01ff, 0x010000000100ff00, 0x010000000100ff01,
+    0x01000000010000ff, 0x0100000001000000, 0x0100000001000001, 0x0100000001000100,
+    0x0100000001000101, 0x010000000101ffff, 0x010000000101ff01, 0x0100000001010000,
+    0x01000000010101ff, 0x0100000001010101, 0x01000001ffffff00, 0x01000001ffff00ff,
+    0x01000001ff00ffff, 0x01000001ff000000, 0x01000001ff000100, 0x01000001ff01ffff,
+    0x01000001ff010001, 0x01000001ff010100, 0x0100000100ff0000, 0x0100000100ff01ff,
+    0x0100000100ff0100, 0x010000010000ff00, 0x010000010000ff01, 0x0100000100000000,
+    0x0100000100000001, 0x0100000100000100, 0x0100000100010000, 0x01000001000101ff,
+    0x0100000101ffff01, 0x0100000101ff00ff, 0x0100000101ff0100, 0x0100000101ff0101,
+    0x010000010100ff01, 0x01000001010000ff, 0x0100000101000000, 0x01000001010100ff,
+    0x0100000101010001, 0x0100000101010100, 0x010001ffffff0000, 0x010001ffff000001,
+    0x010001ffff000100, 0x010001ffff010000, 0x010001ff00ffff00, 0x010001ff00ff0001,
+    0x010001ff0000ffff, 0x010001ff0000ff01, 0x010001ff00000000, 0x010001ff00000001,
+    0x010001ff00000101, 0x010001ff000100ff, 0x010001ff00010000, 0x010001ff01ff0000,
+    0x010001ff0100ff00, 0x010001ff01000001, 0x010001ff01000100, 0x010001ff01010000,
+    0x01000100ffff00ff, 0x01000100ffff0001, 0x01000100ffff0100, 0x01000100ff00ffff,
+    0x01000100ff00ff01, 0x01000100ff000000, 0x01000100ff0001ff, 0x01000100ff000101,
+    0x01000100ff01ffff, 0x01000100ff01ff00, 0x01000100ff0100ff, 0x01000100ff010001,
+    0x0100010000ffffff, 0x0100010000ffff01, 0x0100010000ff0000, 0x0100010000ff01ff,
+    0x0100010000ff0101, 0x010001000000ff00, 0x01000100000000ff, 0x0100010000000000,
+    0x0100010000000001, 0x0100010000000100, 0x010001000001ff01, 0x0100010000010000,
+    0x0100010000010001, 0x0100010000010101, 0x0100010001ffff00, 0x0100010001ff00ff,
+    0x010001000100ffff, 0x010001000100ff01, 0x0100010001000000, 0x0100010001000101,
+    0x010001000101ff00, 0x0100010001010001, 0x01000101ffff0000, 0x01000101ff000000,
+    0x01000101ff010000, 0x0100010100ff00ff, 0x0100010100ff0001, 0x0100010100ff0100,
+    0x010001010000ffff, 0x0100010100000000, 0x01000101000001ff, 0x010001010001ff00,
+    0x0100010101ff0000, 0x010001010100ff00, 0x01000101010000ff, 0x0100010101000000,
+    0x0100010101000001, 0x0101ffffffffffff, 0x0101ffffffffff01, 0x0101ffffffff01ff,
+    0x0101ffffffff0101, 0x0101ffffff000000, 0x0101ffffff01ffff, 0x0101ffffff01ff01,
+    0x0101ffffff0101ff, 0x0101ffffff010101, 0x0101ffff00ff0000, 0x0101ffff0000ff00,
+    0x0101ffff000000ff, 0x0101ffff00000001, 0x0101ffff00000100, 0x0101ffff01ffffff,
+    0x0101ffff01ffff01, 0x0101ffff01ff01ff, 0x0101ffff01ff0101, 0x0101ffff01000000,
+    0x0101ffff0101ffff, 0x0101ffff0101ff01, 0x0101ffff010101ff, 0x0101ffff01010101,
+    0x0101ff00ffff0000, 0x0101ff00ffff0100, 0x0101ff00ff00ff00, 0x0101ff00ff0000ff,
+    0x0101ff00ff000001, 0x0101ff00ff000100, 0x0101ff00ff000101, 0x0101ff0000ff0001,
+    0x0101ff0000ff0100, 0x0101ff000000ff00, 0x0101ff0000000000, 0x0101ff00000001ff,
+    0x0101ff0000000101, 0x0101ff000001ff00, 0x0101ff00000100ff, 0x0101ff0001ff0000,
+    0x0101ff000100ffff, 0x0101ff000100ff01, 0x0101ff0001000001, 0x0101ff0001000100,
+    0x0101ff01ffffff01, 0x0101ff01ffff01ff, 0x0101ff01ffff0101, 0x0101ff01ff00ffff,
+    0x0101ff01ff000100, 0x0101ff01ff01ff01, 0x0101ff01ff0101ff, 0x0101ff01ff010101,
+    0x0101ff0100ff0000, 0x0101ff010000ff00, 0x0101ff0100000001, 0x0101ff0100000100,
+    0x0101ff0100010000, 0x0101ff0101ffffff, 0x0101ff0101ffff01, 0x0101ff0101ff01ff,
+    0x0101ff0101ff0101, 0x0101ff0101000000, 0x0101ff010101ffff, 0x0101ff010101ff01,
+    0x0101ff01010101ff, 0x0101ff0101010101, 0x010100ffff000100, 0x010100ffff010000,
+    0x010100ff00ffff00, 0x010100ff00ff00ff, 0x010100ff0000ffff, 0x010100ff000000ff,
+    0x010100ff00000000, 0x010100ff000001ff, 0x010100ff00000101, 0x010100ff0001ff00,
+    0x010100ff00010000, 0x010100ff00010001, 0x010100ff000101ff, 0x010100ff00010100,
+    0x010100ff01ff0000, 0x01010000ffff0001, 0x01010000ffff0100, 0x01010000ff00ffff,
+    0x01010000ff00ff01, 0x01010000ff000000, 0x01010000ff0001ff, 0x01010000ff010001,
+    0x01010000ff010100, 0x0101000000ffff01, 0x0101000000ff0000, 0x010100000000ff00,
+    0x01010000000000ff, 0x0101000000000000, 0x0101000000000001, 0x0101000000000100,
+    0x0101000000010000, 0x0101000000010101, 0x0101000001ffff00, 0x0101000001ff00ff,
+    0x0101000001ff0000, 0x0101000001ff0001, 0x0101000001ff0100, 0x010100000100ff01,
+    0x0101000001000000, 0x01010000010001ff, 0x01010001ffff0000, 0x01010001ff00ff00,
+    0x01010001ff000001, 0x01010001ff000101, 0x01010001ff01ff00, 0x01010001ff010000,
+    0x0101000100ff00ff, 0x0101000100ff0001, 0x0101000100ff0101, 0x010100010000ff01,
+    0x0101000100000000, 0x0101000100000001, 0x01010001000001ff, 0x010100010001ffff,
+    0x010100010001ff01, 0x0101000101ff0001, 0x010100010100ffff, 0x0101000101000000,
+    0x0101000101000001, 0x0101000101000100, 0x010100010101ff00, 0x01010001010100ff,
+    0x0101000101010001, 0x010101ffffffffff, 0x010101ffffffff01, 0x010101ffffff01ff,
+    0x010101ffffff0101, 0x010101ffff01ffff, 0x010101ffff01ff01, 0x010101ffff0101ff,
+    0x010101ffff010101, 0x010101ff0000ff00, 0x010101ff000000ff, 0x010101ff00000001,
+    0x010101ff00000100, 0x010101ff01ffffff, 0x010101ff01ffff01, 0x010101ff01ff01ff,
+    0x010101ff01ff0101, 0x010101ff01000000, 0x010101ff0101ffff, 0x010101ff0101ff01,
+    0x010101ff010101ff, 0x010101ff01010101, 0x01010100ffff0000, 0x01010100ff0000ff,
+    0x01010100ff000100, 0x01010100ff01ff00, 0x01010100ff010000, 0x0101010000ffff00,
+    0x010101000000ffff, 0x0101010000000000, 0x0101010000000101, 0x010101000001ff00,
+    0x0101010000010001, 0x0101010000010100, 0x010101000100ffff, 0x0101010001000001,
+    0x01010101ffffffff, 0x01010101ffffff01, 0x01010101ffff01ff, 0x01010101ffff0101,
+    0x01010101ff01ffff, 0x01010101ff01ff01, 0x01010101ff0101ff, 0x01010101ff010101,
+    0x010101010000ff00, 0x01010101000000ff, 0x0101010100000001, 0x0101010101ffffff,
+    0x0101010101ffff01, 0x0101010101ff01ff, 0x0101010101ff0101, 0x0101010101000000,
+    0x010101010101ffff, 0x010101010101ff01, 0x01010101010101ff, 0x0101010101010101,
+GGML_TABLE_END()
+#else
+GGML_TABLE_BEGIN(uint32_t, iq1s_grid_gpu, NGRID_IQ1S)
+    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
+    0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
+    0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
+    0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
+    0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
+    0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
+    0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
+    0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
+    0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
+    0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
+    0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
+    0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
+    0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
+    0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
+    0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
+    0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
+    0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
+    0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
+    0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
+    0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
+    0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
+    0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
+    0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
+    0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
+    0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
+    0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
+    0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
+    0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
+    0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
+    0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
+    0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
+    0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
+    0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
+    0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
+    0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
+    0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
+    0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
+    0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
+    0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
+    0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
+    0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
+    0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
+    0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
+    0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
+    0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
+    0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
+    0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
+    0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
+    0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
+    0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
+    0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
+    0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
+    0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
+    0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
+    0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
+    0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
+    0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
+    0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
+    0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
+    0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
+    0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
+    0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
+    0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
+    0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
+    0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
+    0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
+    0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
+    0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
+    0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
+    0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
+    0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
+    0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
+    0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
+    0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
+    0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
+    0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
+    0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
+    0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
+    0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
+    0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
+    0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
+    0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
+    0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
+    0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
+    0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
+    0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
+    0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
+    0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
+    0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
+    0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
+    0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
+    0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
+    0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
+    0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
+    0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
+    0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
+    0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
+    0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
+    0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
+    0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
+    0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
+    0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
+    0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
+    0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
+    0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
+    0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
+    0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
+    0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
+    0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
+    0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
+    0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
+    0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
+    0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
+    0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
+    0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
+    0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
+    0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
+    0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
+    0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
+    0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
+    0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
+    0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
+    0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
+    0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
+    0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
+    0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
+    0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
+    0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
+    0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
+    0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
+    0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
+    0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
+    0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
+    0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
+    0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
+    0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
+    0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
+    0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
+    0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
+    0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
+    0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
+    0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
+    0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
+    0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
+    0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
+    0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
+    0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
+    0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
+    0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
+    0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
+    0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
+    0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
+    0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
+    0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
+    0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
+    0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
+    0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
+    0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
+    0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
+    0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
+    0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
+    0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
+    0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
+    0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
+    0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
+    0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
+    0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
+    0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
+    0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
+    0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
+    0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
+    0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
+    0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
+    0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
+    0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
+    0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
+    0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
+    0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
+    0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
+    0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
+    0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
+    0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
+    0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
+    0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
+    0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
+    0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
+    0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
+    0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
+    0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
+    0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
+    0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
+    0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
+    0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
+    0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
+    0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
+    0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
+    0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
+    0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
+    0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
+    0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
+    0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
+    0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
+    0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
+    0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
+    0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
+    0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
+    0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
+    0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
+    0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
+    0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
+    0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
+    0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
+    0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
+    0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
+    0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
+    0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
+    0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
+    0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
+    0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
+    0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
+    0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
+    0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
+    0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
+    0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
+    0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
+    0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
+    0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
+    0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
+    0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
+    0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
+    0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
+    0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
+    0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
+    0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
+    0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
+    0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
+    0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
+    0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
+    0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
+    0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
+    0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
+    0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
+    0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
+    0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
+    0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
+    0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
+    0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
+    0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
+    0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
+    0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
+    0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
+    0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
+    0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
+    0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
+    0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
+    0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
+GGML_TABLE_END()
+#endif
+
+#endif // GGML_COMMON_IMPL
+#endif // GGML_COMMON_IMPL
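
For orientation (an editorial note, not part of the diff): the two lookup tables above are the 2048-entry IQ1_S quantization grid from ggml-common.h. In the C build, each uint64_t entry of iq1s_grid packs eight ternary weights as one signed byte per weight; only the byte values 0xff (-1), 0x00 (0), and 0x01 (+1) occur. In the GPU build, each uint32_t entry of iq1s_grid_gpu packs the same eight weights as one nibble per weight, stored offset by +1, so only the nibble values 0, 1, and 2 occur. A minimal decode sketch assuming those encodings; the sequential lowest-nibble-first order here is illustrative, since the CUDA kernels unpack in a different, vectorized order.

#include <stdint.h>
#include <stdio.h>

// Decode one iq1s_grid entry (C layout): one signed byte per weight.
static void decode_iq1s(uint64_t entry, int8_t w[8]) {
    for (int i = 0; i < 8; ++i) {
        w[i] = (int8_t)((entry >> (8 * i)) & 0xff);    // 0xff -> -1, 0x00 -> 0, 0x01 -> +1
    }
}

// Decode one iq1s_grid_gpu entry (GPU layout): one nibble per weight, offset-coded.
static void decode_iq1s_gpu(uint32_t entry, int8_t w[8]) {
    for (int i = 0; i < 8; ++i) {
        w[i] = (int8_t)((entry >> (4 * i)) & 0xf) - 1; // 0 -> -1, 1 -> 0, 2 -> +1
    }
}

int main(void) {
    int8_t w[8];
    decode_iq1s_gpu(0x00000002u, w);                   // second entry of the GPU table above
    for (int i = 0; i < 8; ++i) printf("%d ", w[i]);   // prints: 1 -1 -1 -1 -1 -1 -1 -1
    printf("\n");
    return 0;
}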

+ 1 - 4
llama/ggml-cuda.cu

@@ -715,9 +715,6 @@ static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
 GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
-    
-    // HACK: this needs to be freed in msvc
-    free(buffer);
 }
 
 GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -3031,7 +3028,7 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params,
     GGML_UNUSED(params);
 }
 
-// extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
+extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
 
 GGML_CALL int ggml_backend_cuda_reg_devices() {
     int device_count = ggml_backend_cuda_get_device_count();

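Note: the second hunk above re-enables the extern "C" declaration of ggml_backend_cuda_reg_devices rather than leaving it commented out, which gives the definition that follows it C linkage so the symbol is exported unmangled (useful for C callers and cgo). A minimal sketch of the pattern, using a hypothetical function name:

    // extern "C" on the first declaration makes the later C++ definition
    // use C linkage. my_reg_devices is a made-up stand-in, not ggml's API.
    extern "C" int my_reg_devices();

    int my_reg_devices() {
        return 0;
    }
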
+ 43 - 45
llama/ggml-cuda.h

@@ -1,45 +1,43 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_HIPBLAS
-#define GGML_CUDA_NAME "ROCm"
-#define GGML_CUBLAS_NAME "hipBLAS"
-#else
-#define GGML_CUDA_NAME "CUDA"
-#define GGML_CUBLAS_NAME "cuBLAS"
-#endif
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_CUDA_MAX_DEVICES       16
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
-
-GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
-
-// device buffer
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
-
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
-
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-
-GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();
-
-GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
-
-GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
-
-#ifdef  __cplusplus
-}
-#endif
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_CUDA_MAX_DEVICES       16
+
+// backend API
+GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
+
+GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
+
+// device buffer
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+
+GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+
+GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+
+#ifdef  __cplusplus
+}
+#endif

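Note: this whole-file rewrite (apparently a line-ending normalization, since the content is otherwise identical) also drops the ggml_backend_cuda_reg_devices prototype, matching its move to a local extern "C" declaration in ggml-cuda.cu above. For reference, a minimal host-side sketch of how the remaining API is typically used (assumes ggml-cuda is built and linked; ggml_backend_free comes from ggml-backend.h; error handling omitted):

    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    int main() {
        int n = ggml_backend_cuda_get_device_count();
        for (int i = 0; i < n; i++) {
            char desc[128];
            ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
            size_t free_mem = 0, total_mem = 0;
            ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
            printf("device %d: %s, %zu/%zu bytes free\n", i, desc, free_mem, total_mem);
        }
        if (n > 0) {
            ggml_backend_t backend = ggml_backend_cuda_init(0);
            ggml_backend_free(backend); // from ggml-backend.h
        }
        return 0;
    }
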
+ 47 - 47
llama/ggml-cuda/acc.cu

@@ -1,47 +1,47 @@
-#include "acc.cuh"
-
-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= ne) {
-        return;
-    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
-}
-
-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
-
-    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
-}
+#include "acc.cuh"
+
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+}
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+}

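Note: acc_f32 treats nb1, nb2 and offset as float-element counts; the host wrapper divides the byte-based op_params values by 4 before launching, so the "// offset in bytes" comment describes the stored op_params value, not the divided one passed to the kernel. A CPU reference of the same indexing (acc_f32_ref is a made-up name; strides and offset are already in elements here):

    // CPU sketch mirroring acc_f32: dst copies x, with y added into the
    // window described by element strides nb1/nb2 and the element offset.
    static void acc_f32_ref(const float * x, const float * y, float * dst, int ne,
                            int ne10, int ne11, int ne12,
                            int nb1, int nb2, int offset) {
        for (int i = 0; i < ne; i++) {
            const int src1_idx = i - offset;
            const int oz = src1_idx / nb2;
            const int oy = (src1_idx - oz * nb2) / nb1;
            const int ox = src1_idx % nb1;
            if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
                dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
            } else {
                dst[i] = x[i];
            }
        }
    }
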
+ 5 - 5
llama/ggml-cuda/acc.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_ACC_BLOCK_SIZE 256
-
-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_ACC_BLOCK_SIZE 256
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 34 - 34
llama/ggml-cuda/arange.cu

@@ -1,34 +1,34 @@
-#include "arange.cuh"
-
-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    dst[nidx] = start + step * nidx;
-}
-
-static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
-    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
-}
-
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    float start;
-    float stop;
-    float step;
-    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
-    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
-    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
-}
+#include "arange.cuh"
+
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
+}
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
+}

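Note: the host wrapper asserts that the destination was sized to ceil((stop - start) / step) elements, matching the kernel's dst[n] = start + step * n. A worked instance of that count (the values here are arbitrary):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float start = 0.0f, stop = 10.0f, step = 3.0f;
        const long long steps = (long long) std::ceil((stop - start) / step);
        printf("steps = %lld\n", steps); // 4 -> values 0, 3, 6, 9
    }
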
+ 5 - 5
llama/ggml-cuda/arange.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_ARANGE_BLOCK_SIZE 256
-
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_ARANGE_BLOCK_SIZE 256
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 103 - 103
llama/ggml-cuda/argsort.cu

@@ -1,103 +1,103 @@
-#include "argsort.cuh"
-
-template<typename T>
-static inline __device__ void ggml_cuda_swap(T & a, T & b) {
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-template<ggml_sort_order order>
-static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
-    // bitonic sort
-    int col = threadIdx.x;
-    int row = blockIdx.y;
-
-    if (col >= ncols_pad) {
-        return;
-    }
-
-    const float * x_row = x + row * ncols;
-    extern __shared__ int dst_row[];
-
-    // initialize indices
-    dst_row[col] = col;
-
-    __syncthreads();
-
-    for (int k = 2; k <= ncols_pad; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (dst_row[col] >= ncols ||
-                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (dst_row[ixj] >= ncols ||
-                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            __syncthreads();
-        }
-    }
-
-    // copy the result to dst without the padding
-    if (col < ncols) {
-        dst[row * ncols + col] = dst_row[col];
-    }
-}
-
-static int next_power_of_2(int x) {
-    int n = 1;
-    while (n < x) {
-        n *= 2;
-    }
-    return n;
-}
-
-static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
-    // bitonic sort requires ncols to be power of 2
-    const int ncols_pad = next_power_of_2(ncols);
-
-    const dim3 block_dims(ncols_pad, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
-    const size_t shared_mem = ncols_pad * sizeof(int);
-
-    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
-
-    if (order == GGML_SORT_ORDER_ASC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-    } else if (order == GGML_SORT_ORDER_DESC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
-}
+#include "argsort.cuh"
+
+template<typename T>
+static inline __device__ void ggml_cuda_swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.y;
+
+    if (col >= ncols_pad) {
+        return;
+    }
+
+    const float * x_row = x + row * ncols;
+    extern __shared__ int dst_row[];
+
+    // initialize indices
+    dst_row[col] = col;
+
+    __syncthreads();
+
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+static int next_power_of_2(int x) {
+    int n = 1;
+    while (n < x) {
+        n *= 2;
+    }
+    return n;
+}
+
+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    const dim3 block_dims(ncols_pad, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
+
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+}

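Note: k_argsort_f32_i32 is a classic bitonic sorting network, which is why each row is padded to the next power of two; padding indices (>= ncols) act as +infinity sentinels through the dst_row[col] >= ncols checks and fall to the end of the final ascending merge. A CPU reference of the same comparator network for one ascending row (bitonic_argsort is a made-up name; on the GPU each (k, j) stage runs across threads with __syncthreads between stages):

    #include <cstdio>
    #include <utility>
    #include <vector>

    static std::vector<int> bitonic_argsort(const std::vector<float> & x) {
        const int ncols = (int) x.size();
        int pad = 1;
        while (pad < ncols) pad *= 2;       // next_power_of_2, as in the kernel
        std::vector<int> idx(pad);
        for (int i = 0; i < pad; i++) idx[i] = i;

        for (int k = 2; k <= pad; k *= 2) {
            for (int j = k / 2; j > 0; j /= 2) {
                for (int col = 0; col < pad; col++) {
                    const int ixj = col ^ j;
                    if (ixj <= col) continue;   // each pair handled once
                    bool swap_needed;
                    if ((col & k) == 0) {       // ascending subsequence
                        swap_needed = idx[col] >= ncols ||
                            (idx[ixj] < ncols && x[idx[col]] > x[idx[ixj]]);
                    } else {                    // descending subsequence
                        swap_needed = idx[ixj] >= ncols ||
                            (idx[col] < ncols && x[idx[col]] < x[idx[ixj]]);
                    }
                    if (swap_needed) std::swap(idx[col], idx[ixj]);
                }
            }
        }
        idx.resize(ncols);  // drop the padding, like the kernel's final copy
        return idx;
    }

    int main() {
        for (int i : bitonic_argsort({0.5f, -1.0f, 2.0f})) printf("%d ", i);
        printf("\n"); // 1 0 2
    }
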
+ 3 - 3
llama/ggml-cuda/argsort.cuh

@@ -1,3 +1,3 @@
-#include "common.cuh"
-
-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 280 - 280
llama/ggml-cuda/binbcast.cu

@@ -1,280 +1,280 @@
-#include "binbcast.cuh"
-
-static __device__ __forceinline__ float op_repeat(const float a, const float b) {
-    return b;
-    GGML_UNUSED(a);
-}
-
-static __device__ __forceinline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __device__ __forceinline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __device__ __forceinline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13) {
-    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
-    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
-    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13) {
-
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_cuda {
-    template<typename src0_t, typename src1_t, typename dst_t>
-    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
-            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
-            cudaStream_t stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne[] = {ne0, ne1, ne2, ne3};
-        int64_t cne0[] = {ne00, ne01, ne02, ne03};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-
-        size_t cnb[] = {nb0, nb1, nb2, nb3};
-        size_t cnb0[] = {nb00, nb01, nb02, nb03};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
-            for (int i = 0; i < 4; i++) {
-                if (nr[i] != 1) {
-                    break;
-                }
-                if (i > 0) {
-                    collapse_nb(cnb, cne);
-                    collapse_nb(cnb0, cne0);
-                    collapse_nb(cnb1, cne1);
-                    collapse(cne);
-                    collapse(cne0);
-                    collapse(cne1);
-                }
-            }
-        }
-
-        {
-            int64_t ne0 = cne[0];
-            int64_t ne1 = cne[1];
-            int64_t ne2 = cne[2];
-            int64_t ne3 = cne[3];
-
-            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
-            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
-            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
-            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb[0];
-            size_t nb1 = cnb[1];
-            size_t nb2 = cnb[2];
-            size_t nb3 = cnb[3];
-
-            size_t nb00 = cnb0[0];
-            size_t nb01 = cnb0[1];
-            size_t nb02 = cnb0[2];
-            size_t nb03 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            size_t s00 = nb00 / sizeof(src0_t);
-            size_t s01 = nb01 / sizeof(src0_t);
-            size_t s02 = nb02 / sizeof(src0_t);
-            size_t s03 = nb03 / sizeof(src0_t);
-
-            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
-
-            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
-
-            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s00 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            dim3 block_dims;
-            block_dims.x = std::min<unsigned int>(hne0, block_size);
-            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
-            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
-
-            dim3 block_nums(
-                (hne0 + block_dims.x - 1) / block_dims.x,
-                (ne1 + block_dims.y - 1) / block_dims.y,
-                (ne2*ne3 + block_dims.z - 1) / block_dims.z
-            );
-
-            if (block_nums.z > 65535) {
-                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s00, */ s01, s02, s03,
-                    /* s10, */ s11, s12, s13);
-            } else {
-                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s00, */ s01, s02, s03,
-                    /* s10, */ s11, s12, s13);
-            }
-        }
-    }
-};
-
-template<class op>
-static void ggml_cuda_op_bin_bcast(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-}
-
-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
+#include "binbcast.cuh"
+
+static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+    return b;
+    GGML_UNUSED(a);
+}
+
+static __device__ __forceinline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __device__ __forceinline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __device__ __forceinline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
+        /*int s10,*/ int s11, int s12, int s13) {
+    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
+        /*int s10,*/ int s11, int s12, int s13) {
+
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}
+
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template<typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+            cudaStream_t stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne[] = {ne0, ne1, ne2, ne3};
+        int64_t cne0[] = {ne00, ne01, ne02, ne03};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+
+        size_t cnb[] = {nb0, nb1, nb2, nb3};
+        size_t cnb0[] = {nb00, nb01, nb02, nb03};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+            for (int i = 0; i < 4; i++) {
+                if (nr[i] != 1) {
+                    break;
+                }
+                if (i > 0) {
+                    collapse_nb(cnb, cne);
+                    collapse_nb(cnb0, cne0);
+                    collapse_nb(cnb1, cne1);
+                    collapse(cne);
+                    collapse(cne0);
+                    collapse(cne1);
+                }
+            }
+        }
+
+        {
+            int64_t ne0 = cne[0];
+            int64_t ne1 = cne[1];
+            int64_t ne2 = cne[2];
+            int64_t ne3 = cne[3];
+
+            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
+            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
+            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
+            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb[0];
+            size_t nb1 = cnb[1];
+            size_t nb2 = cnb[2];
+            size_t nb3 = cnb[3];
+
+            size_t nb00 = cnb0[0];
+            size_t nb01 = cnb0[1];
+            size_t nb02 = cnb0[2];
+            size_t nb03 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            size_t s00 = nb00 / sizeof(src0_t);
+            size_t s01 = nb01 / sizeof(src0_t);
+            size_t s02 = nb02 / sizeof(src0_t);
+            size_t s03 = nb03 / sizeof(src0_t);
+
+            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+
+            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
+
+            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s00 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            dim3 block_dims;
+            block_dims.x = std::min<unsigned int>(hne0, block_size);
+            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+            dim3 block_nums(
+                (hne0 + block_dims.x - 1) / block_dims.x,
+                (ne1 + block_dims.y - 1) / block_dims.y,
+                (ne2*ne3 + block_dims.z - 1) / block_dims.z
+            );
+
+            if (block_nums.z > 65535) {
+                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00, */ s01, s02, s03,
+                    /* s10, */ s11, s12, s13);
+            } else {
+                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00, */ s01, s02, s03,
+                    /* s10, */ s11, s12, s13);
+            }
+        }
+    }
+};
+
+template<class op>
+static void ggml_cuda_op_bin_bcast(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+}
+
+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}

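Note: the key trick in bin_bcast_cuda is the dimension fusion before launch: for contiguous tensors, leading dimensions that are not broadcast (repeat factor 1) are merged into one, so the kernel sees the smallest equivalent 4D problem and the grid stays compact. A standalone sketch of just that fusion (the extents are arbitrary):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // dst extents {4, 8, 2, 1} vs src1 {4, 8, 1, 1}:
        // nr[i] = ne1[i] / ne[i] (integer); anything but 1 marks a broadcast dim.
        int64_t cne[4] = {4, 8, 2, 1};
        int     nr [4] = {1, 1, 0, 1};

        for (int i = 0; i < 4; i++) {
            if (nr[i] != 1) break;   // stop at the first broadcast dimension
            if (i > 0) {             // fuse the two leading dimensions
                cne[0] *= cne[1];
                cne[1] = cne[2];
                cne[2] = cne[3];
                cne[3] = 1;
            }
        }
        printf("collapsed: %lld %lld %lld %lld\n", (long long) cne[0],
               (long long) cne[1], (long long) cne[2], (long long) cne[3]);
        // -> collapsed: 32 2 1 1 (the two non-broadcast dims were fused)
    }
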
+ 6 - 6
llama/ggml-cuda/binbcast.cuh

@@ -1,6 +1,6 @@
-#include "common.cuh"
-
-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 5 - 5
llama/ggml-cuda/clamp.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_CLAMP_BLOCK_SIZE 256
-
-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_CLAMP_BLOCK_SIZE 256
+
+void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 49 - 49
llama/ggml-cuda/concat.cu

@@ -1,49 +1,49 @@
-#include "concat.cuh"
-
-static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (blockIdx.z < ne02) { // src0
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            blockIdx.z * ne0 * gridDim.y;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne0 +
-            (blockIdx.z - ne02) * ne0 *  gridDim.y;
-        dst[offset_dst] = y[offset_src];
-    }
-}
-
-static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2);
-    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
-}
-
-void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_cuda(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), dst_d + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], stream);
-    }
-}
+#include "concat.cuh"
+
+static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (blockIdx.z < ne02) { // src0
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            blockIdx.z * ne0 * gridDim.y;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne0 +
+            (blockIdx.z - ne02) * ne0 *  gridDim.y;
+        dst[offset_dst] = y[offset_src];
+    }
+}
+
+static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2);
+    concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
+}
+
+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        concat_f32_cuda(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), dst_d + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], stream);
+    }
+}

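Note: concat_f32 joins the inputs along dim 2 by branching on blockIdx.z against ne02, while the host loop walks dim 3 and converts the byte stride nb[3] into float elements with the / 4. A CPU reference of the dim-2 branch (concat_dim2_ref is a made-up name):

    // CPU sketch mirroring concat_f32: planes z < ne02 come from x (src0),
    // the rest from y (src1) at z - ne02.
    static void concat_dim2_ref(const float * x, const float * y, float * dst,
                                int ne0, int ne1, int ne02, int ne12) {
        for (int z = 0; z < ne02 + ne12; z++) {
            for (int r = 0; r < ne1; r++) {
                for (int c = 0; c < ne0; c++) {
                    const int d = c + r * ne0 + z * ne0 * ne1;
                    dst[d] = z < ne02 ? x[c + r * ne0 + z * ne0 * ne1]
                                      : y[c + r * ne0 + (z - ne02) * ne0 * ne1];
                }
            }
        }
    }
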
+ 5 - 5
llama/ggml-cuda/concat.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_CONCAT_BLOCK_SIZE 256
-
-void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_CONCAT_BLOCK_SIZE 256
+
+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 13 - 13
llama/ggml-cuda/convert.cuh

@@ -1,13 +1,13 @@
-#include "common.cuh"
-
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-
-template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
-
-typedef to_t_cuda_t<float> to_fp32_cuda_t;
-typedef to_t_cuda_t<half> to_fp16_cuda_t;
-
-to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
-
-to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
+#include "common.cuh"
+
+#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+
+template<typename T>
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
+
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
+to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
+
+to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);

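Note: ggml_get_to_fp16_cuda / ggml_get_to_fp32_cuda return a per-type function pointer, so a call site resolves the dequantizer once and then launches it without a per-call switch on ggml_type. A sketch of the same dispatch pattern with hypothetical CPU stand-ins (from_i8, get_to_fp32 and DEMO_I8 are made up; the real table maps ggml_type values to CUDA kernels):

    #include <cstdint>
    #include <cstdio>

    typedef void (*to_fp32_t)(const void * x, float * y, int64_t k);

    static void from_i8(const void * x, float * y, int64_t k) {
        const int8_t * q = (const int8_t *) x;
        for (int64_t i = 0; i < k; i++) {
            y[i] = (float) q[i];
        }
    }

    enum demo_type { DEMO_I8 };

    static to_fp32_t get_to_fp32(demo_type type) {
        switch (type) {
            case DEMO_I8: return from_i8;
            default:      return nullptr;
        }
    }

    int main() {
        const int8_t q[3] = {-1, 0, 7};
        float f[3];
        get_to_fp32(DEMO_I8)(q, f, 3); // resolve once, then call
        printf("%g %g %g\n", f[0], f[1], f[2]); // -1 0 7
    }
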
+ 103 - 103
llama/ggml-cuda/dequantize.cuh

@@ -1,103 +1,103 @@
-#include "common.cuh"
-
-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {8.0f, 8.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x = (v.x - 8.0f) * d;
-    v.y = (v.y - 8.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {16.0f, 16.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x = (v.x - 16.0f) * d;
-    v.y = (v.y - 16.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    v.x = x[ib].qs[iqs + 0];
-    v.y = x[ib].qs[iqs + 1];
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-#else
-    v.x *= d;
-    v.y *= d;
-#endif // GGML_CUDA_F16
-}
+#include "common.cuh"
+
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {8.0f, 8.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 8.0f) * d;
+    v.y = (v.y - 8.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {16.0f, 16.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 16.0f) * d;
+    v.y = (v.y - 16.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    v.x = x[ib].qs[iqs + 0];
+    v.y = x[ib].qs[iqs + 1];
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+#else
+    v.x *= d;
+    v.y *= d;
+#endif // GGML_CUDA_F16
+}

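Note: each of these helpers expands one packed byte (or byte pair) into two floats; for q4_0 the low and high nibbles are each biased by 8 and scaled by the block scale d, i.e. v = (q - 8) * d. A worked instance of the q4_0 math (the values are arbitrary; the real d is stored as fp16 in the block):

    #include <cstdio>

    int main() {
        const float d = 0.5f;        // block scale (fp16 in the real block)
        const unsigned vui = 0xA3;   // packed quants: low nibble 3, high 10
        const float vx = ((vui & 0xF) - 8.0f) * d;  // (3 - 8)  * 0.5 = -2.5
        const float vy = ((vui >> 4)  - 8.0f) * d;  // (10 - 8) * 0.5 =  1.0
        printf("%g %g\n", vx, vy);
    }
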
+ 40 - 40
llama/ggml-cuda/diagmask.cu

@@ -1,40 +1,40 @@
-#include "diagmask.cuh"
-
-static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
-    const int col = blockDim.y*blockIdx.y + threadIdx.y;
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int i = row*ncols + col;
-    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
-    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
-}
-
-static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
-    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(nrows_x, block_num_x, 1);
-    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
-}
-
-void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int nrows0 = ggml_nrows(src0);
-
-    const int n_past = ((int32_t *) dst->op_params)[0];
-
-    diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);
-}
+#include "diagmask.cuh"
+
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int i = row*ncols + col;
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
+}
+
+static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const dim3 block_nums(nrows_x, block_num_x, 1);
+    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
+}
+
+void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int nrows0 = ggml_nrows(src0);
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+
+    diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);
+}

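Note: the causal mask here is applied branch-free: instead of selecting -INFINITY, the kernel subtracts FLT_MAX scaled by the boolean (col > n_past + row % rows_per_channel), which the retained comments describe as equivalent within rounding error but slightly faster on GPU. A CPU sketch of the resulting pattern (4x4, n_past = 0):

    #include <cfloat>
    #include <cstdio>

    int main() {
        const int ncols = 4, n_past = 0, rows_per_channel = 4;
        float x[4][4] = {};
        for (int row = 0; row < 4; row++) {
            for (int col = 0; col < ncols; col++) {
                const int masked = col > n_past + row % rows_per_channel;
                x[row][col] -= masked * FLT_MAX;  // branch-free -inf stand-in
                printf("%c ", masked ? '.' : '#');
            }
            printf("\n");  // lower-triangular '#' = attended positions
        }
    }
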
+ 5 - 5
llama/ggml-cuda/diagmask.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
-
-void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+
+void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 813 - 813
llama/ggml-cuda/dmmv.cu

@@ -1,813 +1,813 @@
-#include "dmmv.cuh"
-#include "dequantize.cuh"
-#include "convert.cuh"
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 2
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;
-    const uint8_t * m = (const uint8_t *)(aux + 2);
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp += dall * sum1 - dmin * sum2;
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row >= nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const uint8_t m = 1 << (4*im);
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;
-
-    const uint16_t s_shift = 4*im;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-        const uint8_t * h = x[i].hmask + l0;
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp += d * sum;
-
-    }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row >= nrows) return;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
-
-    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
-
-    const int il  = tid/step;                            // 0...3
-    const int ir  = tid - step*il;                       // 0...7 or 0...3
-    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-#if K_QUANTS_PER_ITERATION == 2
-    uint32_t q32[4];
-    const uint8_t * q4 = (const uint8_t *)q32;
-#else
-    uint16_t q16[4];
-    const uint8_t * q4 = (const uint8_t *)q16;
-#endif
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y1 = yy + i*QK_K + y_offset;
-        const float   * y2 = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-#if K_QUANTS_PER_ITERATION == 2
-        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
-        const uint32_t * q2 = q1 + 16;
-
-        q32[0] = q1[0] & 0x0f0f0f0f;
-        q32[1] = q1[0] & 0xf0f0f0f0;
-        q32[2] = q2[0] & 0x0f0f0f0f;
-        q32[3] = q2[0] & 0xf0f0f0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 4; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
-            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#else
-        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
-        const uint16_t * q2 = q1 + 32;
-
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[0] & 0xf0f0;
-        q16[2] = q2[0] & 0x0f0f;
-        q16[3] = q2[0] & 0xf0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 2; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
-            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#endif
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
-
-    const int row = blockIdx.x;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = threadIdx.x/2;  // 0...15
-    const int ix  = threadIdx.x%2;
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im);
-    const uint8_t hm2  = hm1 << 4;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    uint16_t q16[8];
-    const uint8_t * q4 = (const uint8_t *)q16;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * qh  = x[i].qh + l0;
-        const float   * y1  = yy + i*QK_K + y_offset;
-        const float   * y2  = y1 + 128;
-
-        const float dall = __low2half(x[i].dm);
-        const float dmin = __high2half(x[i].dm);
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        float4 sum = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        const uint16_t * q1 = (const uint16_t *)ql1;
-        const uint16_t * q2 = q1 + 32;
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[8] & 0x0f0f;
-        q16[2] = (q1[0] >> 4) & 0x0f0f;
-        q16[3] = (q1[8] >> 4) & 0x0f0f;
-        q16[4] = q2[0] & 0x0f0f;
-        q16[5] = q2[8] & 0x0f0f;
-        q16[6] = (q2[0] >> 4) & 0x0f0f;
-        q16[7] = (q2[8] >> 4) & 0x0f0f;
-        for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
-    }
-
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (threadIdx.x == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    if (row >= nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-
-#if QK_K == 256
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-#endif
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * ql = x[i].ql + ql_offset;
-        const uint8_t * qh = x[i].qh + qh_offset;
-        const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = x[i].d;
-
-#if K_QUANTS_PER_ITERATION == 1
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp += sum;
-#else
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp += sum;
-#endif
-
-    }
-
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const half * x = (const half *) vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x = x[ib + iqs + 0];
-    v.y = x[ib + iqs + 1];
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
-    // qk = quantized weights per x block
-    // qr = number of quantized weights per data value in x block
-    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = threadIdx.x;
-
-    const int iter_stride = 2*GGML_CUDA_DMMV_X;
-    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-// partial sum for each thread
-#ifdef GGML_CUDA_F16
-    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
-#else
-    float tmp = 0.0f;
-#endif // GGML_CUDA_F16
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
-        const int iqs = (col%qk)/qr; // x quant index
-        const int iybs = col - col%qk; // y block start index
-
-// processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 2) {
-            // process 2 vals per j iter
-
-            // dequantize
-            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-            dfloat2 v;
-            dequantize_kernel(vx, ib, iqs + j/qr, v);
-
-            // matrix multiplication
-            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-#ifdef GGML_CUDA_F16
-            tmp += __hmul2(v, {
-                y[iybs + iqs + j/qr + 0],
-                y[iybs + iqs + j/qr + y_offset]
-            });
-#else
-            tmp += v.x * y[iybs + iqs + j/qr + 0];
-            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
-#endif // GGML_CUDA_F16
-        }
-    }
-
-    // sum up partial sums and write back result
-    tmp = warp_reduce_sum(tmp);
-
-    if (tid == 0) {
-#ifdef GGML_CUDA_F16
-        dst[row] = tmp.x + tmp.y;
-#else
-        dst[row] = tmp;
-#endif // GGML_CUDA_F16
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const dim3 block_dims(32, 1, 1);
-    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
-}
-
-static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(32, ny, 1);
-    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(block_num_y, 1, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    dequantize_mul_mat_vec<1, 1, convert_f16>
-        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
-}
-
-void ggml_cuda_op_dequantize_mul_mat_vec(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream) {
-    GGML_UNUSED(ctx);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_F16
-    ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
-    half * src1_dfloat = nullptr; // dfloat == half
-
-    bool src1_convert_f16 =
-        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
-
-    if (src1_convert_f16) {
-        src1_dfloat = src1_dfloat_a.alloc(ne00);
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-        GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
-    }
-#else
-    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-#endif // GGML_CUDA_F16
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_ddq_i);
-    GGML_UNUSED(src1_ncols);
-    GGML_UNUSED(src1_padded_row_size);
-}
+#include "dmmv.cuh"
+#include "dequantize.cuh"
+#include "convert.cuh"
+
+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
+static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    if (row >= nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q2_K * x = (const block_q2_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
+    const int q_offset = 32*im + l0;
+    const int s_offset = 8*im;
+    const int y_offset = 128*im + l0;
+
+    uint32_t aux[4];
+    const uint8_t * d = (const uint8_t *)aux;
+    const uint8_t * m = (const uint8_t *)(aux + 2);
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
+
+        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
+        aux[0] = a[0] & 0x0f0f0f0f;
+        aux[1] = a[1] & 0x0f0f0f0f;
+        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
+        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
+            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
+
+        }
+        tmp += dall * sum1 - dmin * sum2;
+
+    }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;
+
+    uint32_t uaux[2];
+    const uint8_t * d = (const uint8_t *)uaux;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint32_t * s = (const uint32_t *)x[i].scales;
+
+        uaux[0] = s[0] & 0x0f0f0f0f;
+        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
+
+        const float2 dall = __half22float2(x[i].dm);
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t ql = q[l];
+            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
+                  + y[l+16] * d[1] * ((ql >> 2) & 3)
+                  + y[l+32] * d[2] * ((ql >> 4) & 3)
+                  + y[l+48] * d[3] * ((ql >> 6) & 3);
+            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
+        }
+        tmp += dall.x * sum1 - dall.y * sum2;
+    }
+#endif
+
+    // sum up partial sums and write back result
+    tmp = warp_reduce_sum(tmp);
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
+
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    if (row >= nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q3_K * x = (const block_q3_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
+    const int step = 16/K_QUANTS_PER_ITERATION;
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+    const uint8_t m = 1 << (4*im);
+
+    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
+    const int q_offset =  32*im + l0;
+    const int y_offset = 128*im + l0;
+
+    uint16_t utmp[4];
+    const int8_t * s = (const int8_t *)utmp;
+
+    const uint16_t s_shift = 4*im;
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+        const uint8_t * h = x[i].hmask + l0;
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
+        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
+
+        const float d = x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < n; ++l) {
+            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
+            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
+        }
+        tmp += d * sum;
+
+    }
+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
+    const int in = offset/8;                                 // 0 or 1
+    const int im = offset%8;                                 // 0...7
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint8_t * s = x[i].scales;
+
+        const float dall = (float)x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t hl = x[i].hmask[im+l] >> in;
+            const uint8_t ql = q[l];
+            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
+                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
+                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
+                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+    tmp = warp_reduce_sum(tmp);
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
+
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    if (row >= nrows) return;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1
+
+    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
+
+    const int il  = tid/step;                            // 0...3
+    const int ir  = tid - step*il;                       // 0...7 or 0...3
+    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y1 = yy + i*QK_K + y_offset;
+        const float   * y2 = y1 + 128;
+
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 4; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
+
+    }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    float tmp = 0;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const float   * y = yy + i*QK_K + step;
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux16[0] = a[0] & 0x0f0f;
+        aux16[1] = (a[0] >> 4) & 0x0f0f;
+        const float d = (float)x[i].dm[0];
+        const float m = (float)x[i].dm[1];
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
+                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
+                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
+                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
+        }
+        tmp += sum;
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+    tmp = warp_reduce_sum(tmp);
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
+
+    const int row = blockIdx.x;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid = threadIdx.x/2;  // 0...15
+    const int ix  = threadIdx.x%2;
+
+    const int il  = tid/4;     // 0...3
+    const int ir  = tid - 4*il;// 0...3
+    const int n   = 2;
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    const uint8_t hm1  = 1 << (2*im);
+    const uint8_t hm2  = hm1 << 4;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2) {
+
+        const uint8_t * ql1 = x[i].qs + q_offset;
+        const uint8_t * qh  = x[i].qh + l0;
+        const float   * y1  = yy + i*QK_K + y_offset;
+        const float   * y2  = y1 + 128;
+
+        const float dall = __low2half(x[i].dm);
+        const float dmin = __high2half(x[i].dm);
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+        float4 sum = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
+        for (int l = 0; l < n; ++l) {
+            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+        }
+        tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
+    }
+
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+    const int step = tid * K_QUANTS_PER_ITERATION;
+    const int im = step/8;
+    const int in = step%8;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const int8_t  * s = x[i].scales;
+        const float   * y = yy + i*QK_K + step;
+        const float     d = x[i].d;
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            const uint8_t h = x[i].qh[in+j] >> im;
+            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
+                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
+                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
+                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+    tmp = warp_reduce_sum(tmp);
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    if (row >= nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q6_K * x = (const block_q6_K *)vx + ib0;
+
+#if QK_K == 256
+
+    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
+    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+#if K_QUANTS_PER_ITERATION == 1
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
+    const int is = 0;
+#else
+    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
+    const int is = in / 4;
+#endif
+    const int ql_offset = 64*im + l0;
+    const int qh_offset = 32*im + l0;
+    const int s_offset  =  8*im + is;
+    const int y_offset = 128*im + l0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * ql = x[i].ql + ql_offset;
+        const uint8_t * qh = x[i].qh + qh_offset;
+        const int8_t  * s  = x[i].scales + s_offset;
+
+        const float d = x[i].d;
+
+#if K_QUANTS_PER_ITERATION == 1
+        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
+        tmp += sum;
+#else
+        float sum = 0;
+        for (int l = 0; l < 4; ++l) {
+            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
+        }
+        tmp += sum;
+#endif
+
+    }
+
+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + step;
+        const uint8_t * ql = x[i].ql + step;
+        const uint8_t * qh = x[i].qh + step;
+        const int8_t  * s  = x[i].scales;
+
+        const float d = x[i+0].d;
+
+        float sum = 0;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
+                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
+                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
+                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
+        }
+        tmp += sum;
+
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+    tmp = warp_reduce_sum(tmp);
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const half * x = (const half *) vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x = x[ib + iqs + 0];
+    v.y = x[ib + iqs + 1];
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
+    // qk = quantized weights per x block
+    // qr = number of quantized weights per data value in x block
+    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int tid = threadIdx.x;
+
+    const int iter_stride = 2*GGML_CUDA_DMMV_X;
+    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+// partial sum for each thread
+#ifdef GGML_CUDA_F16
+    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_CUDA_F16
+
+    for (int i = 0; i < ncols; i += iter_stride) {
+        const int col = i + vals_per_iter*tid;
+        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
+        const int iqs = (col%qk)/qr; // x quant index
+        const int iybs = col - col%qk; // y block start index
+
+// processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel(vx, ib, iqs + j/qr, v);
+
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_CUDA_F16
+            tmp += __hmul2(v, {
+                y[iybs + iqs + j/qr + 0],
+                y[iybs + iqs + j/qr + y_offset]
+            });
+#else
+            tmp += v.x * y[iybs + iqs + j/qr + 0];
+            tmp += v.y * y[iybs + iqs + j/qr + y_offset];
+#endif // GGML_CUDA_F16
+        }
+    }
+
+    // sum up partial sums and write back result
+    tmp = warp_reduce_sum(tmp);
+
+    if (tid == 0) {
+#ifdef GGML_CUDA_F16
+        dst[row] = tmp.x + tmp.y;
+#else
+        dst[row] = tmp;
+#endif // GGML_CUDA_F16
+    }
+}
+
+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const dim3 block_dims(32, 1, 1);
+    dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(32, ny, 1);
+    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    dequantize_mul_mat_vec<1, 1, convert_f16>
+        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
+}
+
+void ggml_cuda_op_dequantize_mul_mat_vec(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream) {
+    GGML_UNUSED(ctx);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_F16
+    ggml_cuda_pool_alloc<half> src1_dfloat_a(ctx.pool());
+    half * src1_dfloat = nullptr; // dfloat == half
+
+    bool src1_convert_f16 =
+        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+    if (src1_convert_f16) {
+        src1_dfloat = src1_dfloat_a.alloc(ne00);
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        GGML_ASSERT(to_fp16_cuda != nullptr);
+        to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
+    }
+#else
+    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
+#endif // GGML_CUDA_F16
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_F16:
+            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_ncols);
+    GGML_UNUSED(src1_padded_row_size);
+}
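Note: every kernel in dmmv.cu above finishes with `warp_reduce_sum(tmp)`, which lives in common.cuh and is not part of this diff. As a hedged sketch of what such a reduction typically looks like (a butterfly over warp shuffle lanes; the upstream implementation may differ in detail):

// Sketch only, assuming 32-thread warps; not copied from this diff.
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // each lane accumulates the value held by the lane 'offset' away
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x; // every lane ends up holding the full warp sum
}

After the reduction every lane holds the same total, which is why only `tid == 0` needs to write `dst[row]` in the kernels above.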

+ 18 - 18
llama/ggml-cuda/dmmv.cuh

@@ -1,18 +1,18 @@
-#include "common.cuh"
-
-// dmmv = dequantize_mul_mat_vec
-
-// TODO: remove this?
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
-#endif
-
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
-#endif
-
-void ggml_cuda_op_dequantize_mul_mat_vec(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
+#include "common.cuh"
+
+// dmmv = dequantize_mul_mat_vec
+
+// TODO: remove this?
+#ifndef GGML_CUDA_DMMV_X
+#define GGML_CUDA_DMMV_X 32
+#endif
+
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
+#endif
+
+void ggml_cuda_op_dequantize_mul_mat_vec(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);
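The two defaults above fully determine the dmmv launch shape used by the launchers in dmmv.cu: one warp per row along x, `GGML_CUDA_MMV_Y` rows per block along y, and a grid rounded up with ceil-division. A minimal illustration of that arithmetic (the helper name is invented for this note):

// Hypothetical helper mirroring the launch arithmetic in dmmv.cu.
static dim3 dmmv_grid(int nrows, int mmv_y /* GGML_CUDA_MMV_Y */) {
    return dim3((nrows + mmv_y - 1) / mmv_y, 1, 1); // ceil(nrows / mmv_y) blocks
}
// e.g. nrows = 4096, mmv_y = 1 -> 4096 blocks, each dim3(WARP_SIZE, 1, 1)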

+ 178 - 178
llama/ggml-cuda/getrows.cu

@@ -1,178 +1,178 @@
-#include "getrows.cuh"
-#include "dequantize.cuh"
-
-template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(
-            const void * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-
-    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
-
-    const int ib = i00/qk; // block index
-    const int iqs = (i00%qk)/qr; // quant index
-    const int iybs = i00 - i00%qk; // dst block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(src0_row, ib, iqs, v);
-
-    dst_row[iybs + iqs + 0]        = v.x;
-    dst_row[iybs + iqs + y_offset] = v.y;
-}
-
-template<typename src0_t, typename dst_t>
-static __global__ void k_get_rows_float(
-            const src0_t * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-
-    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
-
-    dst_row[i00] = src0_row[i00];
-}
-
-template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    GGML_ASSERT(ne00 % 2 == 0);
-
-    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
-            ne00, /*ne01, ne02, ne03,*/
-            /*ne10, ne11,*/ ne12, /*ne13,*/
-            /* s0,*/ s1, s2, s3,
-            /* nb00,*/ nb01, nb02, nb03,
-            s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
-}
-
-template<typename src0_t>
-static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
-            ne00, /*ne01, ne02, ne03,*/
-            /*ne10, ne11,*/ ne12, /*ne13,*/
-            /* s0,*/ s1, s2, s3,
-            /* nb00,*/ nb01, nb02, nb03,
-            s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
-}
-
-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        default:
-            // TODO: k-quants
-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ASSERT(false);
-            break;
-    }
-}
+#include "getrows.cuh"
+#include "dequantize.cuh"
+
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(
+            const void * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0]        = v.x;
+    dst_row[iybs + iqs + y_offset] = v.y;
+}
+
+template<typename src0_t, typename dst_t>
+static __global__ void k_get_rows_float(
+            const src0_t * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
+}
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
+            src0_dd, src1_dd, dst_dd,
+            ne00, /*ne01, ne02, ne03,*/
+            /*ne10, ne11,*/ ne12, /*ne13,*/
+            /* s0,*/ s1, s2, s3,
+            /* nb00,*/ nb01, nb02, nb03,
+            s10, s11, s12/*, s13*/);
+
+    GGML_UNUSED(dst);
+}
+
+template<typename src0_t>
+static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+            src0_dd, src1_dd, dst_dd,
+            ne00, /*ne01, ne02, ne03,*/
+            /*ne10, ne11,*/ ne12, /*ne13,*/
+            /* s0,*/ s1, s2, s3,
+            /* nb00,*/ nb01, nb02, nb03,
+            s10, s11, s12/*, s13*/);
+
+    GGML_UNUSED(dst);
+}
+
+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        default:
+            // TODO: k-quants
+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_ASSERT(false);
+            break;
+    }
+}
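For reference, the gather that the two kernels implement is simple once the stride bookkeeping is stripped away. A hedged CPU sketch of the float path (this helper is written for this note, not part of the diff; all strides are in elements here for simplicity, whereas the kernel keeps src0's strides in bytes):

#include <cstdint>

// dst[i10, i11, i12, :] = src0[src1[i10, i11, i12], i11, i12, :]
static void get_rows_f32_ref(
        const float * src0, const int32_t * src1, float * dst,
        int64_t ne00, int64_t ne10, int64_t ne11, int64_t ne12,
        size_t nb01, size_t nb02, size_t nb03,   // src0 strides (elements)
        size_t s1, size_t s2, size_t s3,         // dst strides
        size_t s10, size_t s11, size_t s12) {    // src1 strides
    for (int64_t i12 = 0; i12 < ne12; ++i12)
    for (int64_t i11 = 0; i11 < ne11; ++i11)
    for (int64_t i10 = 0; i10 < ne10; ++i10) {
        const int32_t i01 = src1[i10*s10 + i11*s11 + i12*s12]; // row to gather
        const float * src_row = src0 + i01*nb01 + i11*nb02 + i12*nb03;
        float       * dst_row = dst  + i10*s1  + i11*s2  + i12*s3;
        for (int64_t i00 = 0; i00 < ne00; ++i00) {
            dst_row[i00] = src_row[i00];
        }
    }
}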

+ 5 - 5
llama/ggml-cuda/getrows.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_GET_ROWS_BLOCK_SIZE 256
-
-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
+
+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 104 - 104
llama/ggml-cuda/im2col.cu

@@ -1,104 +1,104 @@
-#include "im2col.cuh"
-
-template <typename T>
-static  __global__ void im2col_kernel(
-        const float * x, T * dst, int64_t batch_offset,
-        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
-        int s0, int s1, int p0, int p1, int d0, int d1) {
-    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
-    if (i >= pelements) {
-        return;
-    }
-
-    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
-    const int64_t  kx = i / ksize;
-    const int64_t  kd = kx * ksize;
-    const int64_t  ky = (i - kd) / OW;
-    const int64_t  ix = i % OW;
-
-    const int64_t  oh = blockIdx.y;
-    const int64_t  batch = blockIdx.z / IC;
-    const int64_t  ic = blockIdx.z % IC;
-
-    const int64_t iiw = ix * s0 + kx * d0 - p0;
-    const int64_t iih = oh * s1 + ky * d1 - p1;
-
-    const int64_t offset_dst =
-        ((batch * OH + oh) * OW + ix) * CHW +
-        (ic * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
-    } else {
-        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
-        dst[offset_dst] = x[offset_src + iih * IW + iiw];
-    }
-}
-
-template <typename T>
-static void im2col_cuda(const float * x, T* dst,
-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-    const int parallel_elements = OW * KW * KH;
-    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OH, batch * IC);
-    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
-}
-
-static void im2col_cuda_f16(const float * x, half * dst,
-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-
-    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
-}
-
-static void im2col_cuda_f32(const float * x, float * dst,
-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-
-    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
-}
-
-void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
-    const int64_t IC = src1->ne[is_2D ? 2 : 1];
-    const int64_t IH = is_2D ? src1->ne[1] : 1;
-    const int64_t IW =         src1->ne[0];
-
-    const int64_t KH = is_2D ? src0->ne[1] : 1;
-    const int64_t KW =         src0->ne[0];
-
-    const int64_t OH = is_2D ? dst->ne[2] : 1;
-    const int64_t OW =         dst->ne[1];
-
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-    const int64_t batch = src1->ne[3];
-    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
-
-    if(dst->type == GGML_TYPE_F16) {
-        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
-    } else {
-        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
-    }
-}
+#include "im2col.cuh"
+
+template <typename T>
+static  __global__ void im2col_kernel(
+        const float * x, T * dst, int64_t batch_offset,
+        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i >= pelements) {
+        return;
+    }
+
+    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
+    const int64_t  kx = i / ksize;
+    const int64_t  kd = kx * ksize;
+    const int64_t  ky = (i - kd) / OW;
+    const int64_t  ix = i % OW;
+
+    const int64_t  oh = blockIdx.y;
+    const int64_t  batch = blockIdx.z / IC;
+    const int64_t  ic = blockIdx.z % IC;
+
+    const int64_t iiw = ix * s0 + kx * d0 - p0;
+    const int64_t iih = oh * s1 + ky * d1 - p1;
+
+    const int64_t offset_dst =
+        ((batch * OH + oh) * OW + ix) * CHW +
+        (ic * (KW * KH) + ky * KW + kx);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = 0.0f;
+    } else {
+        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
+        dst[offset_dst] = x[offset_src + iih * IW + iiw];
+    }
+}
+
+template <typename T>
+static void im2col_cuda(const float * x, T* dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    dim3 block_nums(num_blocks, OH, batch * IC);
+    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
+static void im2col_cuda_f16(const float * x, half * dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+
+    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+}
+
+static void im2col_cuda_f32(const float * x, float * dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+
+    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+}
+
+void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW =         src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW =         src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW =         dst->ne[1];
+
+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t batch = src1->ne[3];
+    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
+
+    if(dst->type == GGML_TYPE_F16) {
+        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+    } else {
+        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+    }
+}
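The bounds check `iih < 0 || iih >= IH || iiw < 0 || iiw >= IW` above encodes standard convolution geometry: input positions that fall into the padding are written out as zeros. The implied output-size arithmetic, as a small sketch (the helper name is invented for this note):

#include <cstdint>

// Standard conv output size for one axis: input in, kernel k,
// stride s, padding p, dilation d.
static int64_t conv_out_size(int64_t in, int64_t k, int s, int p, int d) {
    return (in + 2*p - d*(k - 1) - 1) / s + 1;
}
// e.g. conv_out_size(5, 3, /*s=*/1, /*p=*/1, /*d=*/1) == 5  ("same" padding)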

+ 5 - 5
llama/ggml-cuda/im2col.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_IM2COL_BLOCK_SIZE 256
-
-void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_IM2COL_BLOCK_SIZE 256
+
+void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 9 - 9
llama/ggml-cuda/mmq.cuh

@@ -1,9 +1,9 @@
-#include "common.cuh"
-
-void ggml_cuda_op_mul_mat_q(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
-
-bool ggml_cuda_supports_mmq(enum ggml_type type);
+#include "common.cuh"
+
+void ggml_cuda_op_mul_mat_q(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);
+
+bool ggml_cuda_supports_mmq(enum ggml_type type);

+ 7 - 7
llama/ggml-cuda/mmvq.cuh

@@ -1,7 +1,7 @@
-#include "common.cuh"
-
-void ggml_cuda_op_mul_mat_vec_q(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
+#include "common.cuh"
+
+void ggml_cuda_op_mul_mat_vec_q(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);
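Together with mmq.cuh and dmmv.cuh above, this header completes the three quantized mul_mat entry points: mmvq computes dot products directly on quantized blocks, dmmv dequantizes to floating point first, and mmq handles the wider matrix-matrix case. All three share one signature, which lets the backend dispatch through a single alias. A sketch of such an alias (the actual typedef lives in ggml-cuda outside this section, so treat the name as assumed):

// Assumed name; the signature matches the three declarations above.
typedef void (*ggml_cuda_op_mul_mat_t)(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i,
    float * dst_dd_i, const int64_t row_low, const int64_t row_high,
    const int64_t src1_ncols, const int64_t src1_padded_row_size,
    cudaStream_t stream);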

+ 215 - 215
llama/ggml-cuda/norm.cu

@@ -1,215 +1,215 @@
-#include "norm.cuh"
-
-template <int block_size>
-static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float2 mean_var = make_float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x += xi;
-        mean_var.y += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var);
-    if (block_size > WARP_SIZE) {
-        __shared__ float2 s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        __syncthreads();
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var);
-    }
-
-    const float mean = mean_var.x / ncols;
-    const float var = mean_var.y / ncols - mean * mean;
-    const float inv_std = rsqrtf(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
-template <int block_size>
-static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
-    // blockIdx.x: num_groups idx
-    // threadIdx.x: block_size idx
-    int start = blockIdx.x * group_size;
-    int end = start + group_size;
-
-    start += threadIdx.x;
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    float variance = tmp / group_size;
-    float scale = rsqrtf(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-template <int block_size>
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp);
-    if (block_size > WARP_SIZE) {
-        __shared__ float s_sum[32];
-        int warp_id = threadIdx.x / WARP_SIZE;
-        int lane_id = threadIdx.x % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        __syncthreads();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = rsqrtf(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
-static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
-    }
-}
-
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const dim3 block_dims(WARP_SIZE, 1, 1);
-        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    } else {
-        const dim3 block_dims(1024, 1, 1);
-        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
-    }
-}
-
-void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
-}
-
-void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
-}
-
-void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
-}
+#include "norm.cuh"
+
+template <int block_size>
+static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    float2 mean_var = make_float2(0.f, 0.f);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row*ncols + col];
+        mean_var.x += xi;
+        mean_var.y += xi * xi;
+    }
+
+    // sum up partial sums
+    mean_var = warp_reduce_sum(mean_var);
+    if (block_size > WARP_SIZE) {
+        __shared__ float2 s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        __syncthreads();
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var);
+    }
+
+    const float mean = mean_var.x / ncols;
+    const float var = mean_var.y / ncols - mean * mean;
+    const float inv_std = rsqrtf(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
+
+template <int block_size>
+static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
+    // blockIdx.x: num_groups idx
+    // threadIdx.x: block_size idx
+    int start = blockIdx.x * group_size;
+    int end = start + group_size;
+
+    start += threadIdx.x;
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    float variance = tmp / group_size;
+    float scale = rsqrtf(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
+template <int block_size>
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int tid = threadIdx.x;
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row*ncols + col];
+        tmp += xi * xi;
+    }
+
+    // sum up partial sums
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __shared__ float s_sum[32];
+        int warp_id = threadIdx.x / WARP_SIZE;
+        int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        __syncthreads();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    const float mean = tmp / ncols;
+    const float scale = rsqrtf(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = scale * x[row*ncols + col];
+    }
+}
+
+static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
+}
+
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
+    }
+}
+
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const dim3 block_dims(WARP_SIZE, 1, 1);
+        rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        rms_norm_f32<1024><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
+    }
+}
+
+void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
+}
+
+void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
+}
+
+void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
+}
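The three kernels in norm.cu differ only in which statistic scales the input: `norm_f32` subtracts the mean and divides by the standard deviation, `group_norm_f32` applies the same formula per group of channels with a fixed eps of 1e-6f, and `rms_norm_f32` skips the mean subtraction entirely. A scalar reference for the RMS case (written for this note, not part of the diff):

#include <cmath>

// norm:     y_i = (x_i - mean) * rsqrt(var + eps), var = E[x^2] - E[x]^2
// rms_norm: y_i = x_i * rsqrt(mean(x^2) + eps)     (no mean subtraction)
static void rms_norm_ref(const float * x, float * y, int n, float eps) {
    float ss = 0.0f;
    for (int i = 0; i < n; ++i) {
        ss += x[i] * x[i];              // sum of squares, as in rms_norm_f32
    }
    const float scale = 1.0f / std::sqrt(ss / n + eps);
    for (int i = 0; i < n; ++i) {
        y[i] = scale * x[i];
    }
}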

+ 7 - 7
llama/ggml-cuda/norm.cuh

@@ -1,7 +1,7 @@
-#include "common.cuh"
-
-void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 49 - 49
llama/ggml-cuda/pad.cu

@@ -1,49 +1,49 @@
-#include "pad.cuh"
-
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
-    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
-    // blockIdx.y: idx of ne1
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne00 +
-            blockIdx.z * ne00 * ne01;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
-
-static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02, const int ne03,
-    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2*ne3);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
-}
-
-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    pad_f32_cuda(src0_d, dst_d,
-        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
-}
+#include "pad.cuh"
+
+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+    // blockIdx.y: idx of ne1
+    // blockIdx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+static void pad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_cuda(src0_d, dst_d,
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
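Semantically, `pad_f32` copies src0 into the leading corner of a larger, zero-filled dst. A hedged CPU reference (3D only, matching the assert above; the helper is written for this note):

#include <cstdint>

static void pad_f32_ref(const float * x, float * dst,
                        int ne00, int ne01, int ne02,   // src dims
                        int ne0,  int ne1,  int ne2) {  // dst dims, >= src dims
    for (int i2 = 0; i2 < ne2; ++i2)
    for (int i1 = 0; i1 < ne1; ++i1)
    for (int i0 = 0; i0 < ne0; ++i0) {
        const bool inside = i0 < ne00 && i1 < ne01 && i2 < ne02;
        dst[((int64_t)i2*ne1 + i1)*ne0 + i0] =
            inside ? x[((int64_t)i2*ne01 + i1)*ne00 + i0] : 0.0f;
    }
}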

+ 5 - 5
llama/ggml-cuda/pad.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_PAD_BLOCK_SIZE 256
-
-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_PAD_BLOCK_SIZE 256
+
+void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 94 - 94
llama/ggml-cuda/pool2d.cu

@@ -1,94 +1,94 @@
-#include "pool2d.cuh"
-
-template <typename Ti, typename To>
-static  __global__ void pool2d_nchw_kernel(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const Ti* src, To* dst, const enum ggml_op_pool op) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx >= parallel_elements) {
-        return;
-    }
-
-    const int I_HW = ih * iw;
-    const int O_HW = oh * ow;
-    const int nc = idx / O_HW;
-    const int cur_oh = idx % O_HW / ow;
-    const int cur_ow = idx % O_HW % ow;
-    const Ti* i_ptr = src + nc * I_HW;
-    To* o_ptr = dst + nc * O_HW;
-    const int start_h = cur_oh * sh - ph;
-    const int bh = max(0, start_h);
-    const int eh = min(ih, start_h + kh);
-    const int start_w = cur_ow * sw - pw;
-    const int bw = max(0, start_w);
-    const int ew = min(iw, start_w + kw);
-    const To scale = 1. / (kh * kw);
-    To res = 0;
-
-    switch (op) {
-        case GGML_OP_POOL_AVG: res = 0; break;
-        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
-        default: assert(false);
-    }
-
-    for (int i = bh; i < eh; i += 1) {
-        for (int j = bw; j < ew; j += 1) {
-#if __CUDA_ARCH__ >= 350
-            Ti cur = __ldg(i_ptr + i * iw + j);
-#else
-            Ti cur = i_ptr[i * iw + j];
-#endif
-            switch (op) {
-                case GGML_OP_POOL_AVG: res += cur * scale; break;
-                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
-                default: assert(false);
-            }
-        }
-    }
-    o_ptr[cur_oh * ow + cur_ow] = res;
-}
-
-static void pool2d_nchw_kernel_f32_f32_cuda(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const float * src, float * dst, const enum ggml_op_pool op,
-        cudaStream_t stream) {
-
-    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
-    dim3 block_nums(num_blocks);
-    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
-}
-
-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-
-    const int64_t IH = src0->ne[1];
-    const int64_t IW = src0->ne[0];
-
-    const int64_t N = dst->ne[3];
-    const int64_t OC = dst->ne[2];
-    const int64_t OH = dst->ne[1];
-    const int64_t OW = dst->ne[0];
-
-    const int parallel_elements = N * OC * OH * OW;
-
-    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
-}
+#include "pool2d.cuh"
+
+template <typename Ti, typename To>
+static  __global__ void pool2d_nchw_kernel(
+        const int ih, const int iw, const int oh, const int ow,
+        const int kh, const int kw, const int sh, const int sw,
+        const int ph, const int pw, const int parallel_elements,
+        const Ti* src, To* dst, const enum ggml_op_pool op) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= parallel_elements) {
+        return;
+    }
+
+    const int I_HW = ih * iw;
+    const int O_HW = oh * ow;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / ow;
+    const int cur_ow = idx % O_HW % ow;
+    const Ti* i_ptr = src + nc * I_HW;
+    To* o_ptr = dst + nc * O_HW;
+    const int start_h = cur_oh * sh - ph;
+    const int bh = max(0, start_h);
+    const int eh = min(ih, start_h + kh);
+    const int start_w = cur_ow * sw - pw;
+    const int bw = max(0, start_w);
+    const int ew = min(iw, start_w + kw);
+    const To scale = 1. / (kh * kw);
+    To res = 0;
+
+    switch (op) {
+        case GGML_OP_POOL_AVG: res = 0; break;
+        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+        default: assert(false);
+    }
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+#if __CUDA_ARCH__ >= 350
+            Ti cur = __ldg(i_ptr + i * iw + j);
+#else
+            Ti cur = i_ptr[i * iw + j];
+#endif
+            switch (op) {
+                case GGML_OP_POOL_AVG: res += cur * scale; break;
+                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
+                default: assert(false);
+            }
+        }
+    }
+    o_ptr[cur_oh * ow + cur_ow] = res;
+}
+
+static void pool2d_nchw_kernel_f32_f32_cuda(
+        const int ih, const int iw, const int oh, const int ow,
+        const int kh, const int kw, const int sh, const int sw,
+        const int ph, const int pw, const int parallel_elements,
+        const float * src, float * dst, const enum ggml_op_pool op,
+        cudaStream_t stream) {
+
+    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
+    dim3 block_nums(num_blocks);
+    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
+}
+
+void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    const int64_t IH = src0->ne[1];
+    const int64_t IW = src0->ne[0];
+
+    const int64_t N = dst->ne[3];
+    const int64_t OC = dst->ne[2];
+    const int64_t OH = dst->ne[1];
+    const int64_t OW = dst->ne[0];
+
+    const int parallel_elements = N * OC * OH * OW;
+
+    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
+}
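
Reviewer note: pool2d_nchw_kernel assigns one thread per output element (N*OC*OH*OW) and walks the kernel window clipped to the input; dst->op_params packs (op, k0, k1, s0, s1, p0, p1), which the launcher passes in (kh, kw, sh, sw, ph, pw) order. A CPU sketch of one averaged output element under those semantics follows (max pooling would track a running maximum instead); pool2d_avg_ref is a hypothetical name.

    // CPU sketch of one NCHW average-pooling output element, matching the kernel above.
    static float pool2d_avg_ref(const float * chan, int ih, int iw,
                                int oh_i, int ow_i, int kh, int kw,
                                int sh, int sw, int ph, int pw) {
        const float scale = 1.0f / (kh * kw);
        const int start_h = oh_i * sh - ph;
        const int start_w = ow_i * sw - pw;
        const int bh = start_h > 0 ? start_h : 0;              // clip window to the input
        const int eh = start_h + kh < ih ? start_h + kh : ih;
        const int bw = start_w > 0 ? start_w : 0;
        const int ew = start_w + kw < iw ? start_w + kw : iw;
        float res = 0.0f;
        for (int i = bh; i < eh; ++i) {
            for (int j = bw; j < ew; ++j) {
                res += chan[i * iw + j] * scale;
            }
        }
        return res;
    }

As in the kernel, the divisor stays kh*kw even when padding clips the window.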

+ 5 - 5
llama/ggml-cuda/pool2d.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_POOL2D_BLOCK_SIZE 256
-
-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_POOL2D_BLOCK_SIZE 256
+
+void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 45 - 45
llama/ggml-cuda/quantize.cu

@@ -1,45 +1,45 @@
-#include "quantize.cuh"
-
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
-    const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (ix >= kx_padded) {
-        return;
-    }
-
-    const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
-
-    const int64_t i_padded = (int64_t)iy*kx_padded + ix;
-
-    block_q8_1 * y = (block_q8_1 *) vy;
-
-    const int64_t ib = i_padded / QK8_1; // block index
-    const int64_t iqs = i_padded % QK8_1; // quant index
-
-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
-    float amax = fabsf(xi);
-    float sum = xi;
-
-    amax = warp_reduce_max(amax);
-    sum = warp_reduce_sum(sum);
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs > 0) {
-        return;
-    }
-
-    reinterpret_cast<half&>(y[ib].ds.x) = d;
-    reinterpret_cast<half&>(y[ib].ds.y) = sum;
-}
-
-void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
-    const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, ky, 1);
-    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
-}
-
+#include "quantize.cuh"
+
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
+    const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (ix >= kx_padded) {
+        return;
+    }
+
+    const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
+
+    const int64_t i_padded = (int64_t)iy*kx_padded + ix;
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int64_t ib = i_padded / QK8_1; // block index
+    const int64_t iqs = i_padded % QK8_1; // quant index
+
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
+    float amax = fabsf(xi);
+    float sum = xi;
+
+    amax = warp_reduce_max(amax);
+    sum = warp_reduce_sum(sum);
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    reinterpret_cast<half&>(y[ib].ds.x) = d;
+    reinterpret_cast<half&>(y[ib].ds.y) = sum;
+}
+
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
+    const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, ky, 1);
+    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
+}
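
Reviewer note: quantize_q8_1 emits one block_q8_1 per 32 padded values: d = max|x| / 127, q_i = round(x_i / d), and ds packs (d, sum of x). The stored row sum is what the q4_1/q5_1-style dot products use to recover their additive minimum term. A per-block CPU sketch, assuming QK8_1 == 32; quantize_q8_1_ref is a hypothetical name.

    // CPU sketch of one q8_1 block, mirroring the kernel above.
    static void quantize_q8_1_ref(const float * x, int8_t * qs, float * d, float * sum) {
        float amax = 0.0f, s = 0.0f;
        for (int i = 0; i < 32; ++i) {
            amax = fmaxf(amax, fabsf(x[i]));
            s += x[i];
        }
        *d   = amax / 127.0f;
        *sum = s;                        // stored in ds.y; consumed by the *_q8_1 dot kernels
        for (int i = 0; i < 32; ++i) {
            qs[i] = amax == 0.0f ? 0 : (int8_t)roundf(x[i] / *d);
        }
    }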
+

+ 5 - 5
llama/ggml-cuda/quantize.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_QUANTIZE_BLOCK_SIZE 256
-
-void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
+#include "common.cuh"
+
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
+
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);

+ 308 - 308
llama/ggml-cuda/rope.cu

@@ -1,308 +1,308 @@
-#include "rope.cuh"
-
-struct rope_corr_dims {
-    float v[4];
-};
-
-static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / max(0.001f, high - low);
-    return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static __device__ void rope_yarn(
-    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
-    }
-    *cos_theta = cosf(theta) * mscale;
-    *sin_theta = sinf(theta) * mscale;
-}
-
-// rope == RoPE == rotary positional embedding
-template<typename T, bool has_pos>
-static __global__ void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
-) {
-    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(freq_base, -float(col)/ncols);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + 1];
-
-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + 1] = x0*sin_theta + x1*cos_theta;
-}
-
-template<typename T, bool has_pos>
-static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
-) {
-    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int ib = col / n_dims;
-    const int ic = col % n_dims;
-
-    if (ib > 0) {
-        const int i = row*ncols + ib*n_dims + ic;
-
-        dst[i + 0] = x[i + 0];
-        dst[i + 1] = x[i + 1];
-
-        return;
-    }
-
-    const int i  = row*ncols + ib*n_dims + ic/2;
-    const int i2 = row/p_delta_rows;
-
-    float cur_rot = inv_ndims * ic - ib;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + n_dims/2];
-
-    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
-    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
-}
-
-static __global__ void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    int n_ctx
-) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-    const int half_n_dims = ncols/4;
-
-    if (col >= half_n_dims) {
-        return;
-    }
-
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
-     // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;
-
-    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
-    const float sin_theta = sinf(theta);
-    const float cos_theta = cosf(theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + half_n_dims];
-
-    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
-    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-
-    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
-    const float sin_block_theta = sinf(block_theta);
-    const float cos_block_theta = cosf(block_theta);
-
-    const float x2 = x[i + half_n_dims * 2];
-    const float x3 = x[i + half_n_dims * 3];
-
-    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
-    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
-}
-
-
-template<typename T>
-static void rope_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
-) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(nrows, num_blocks_x, 1);
-    if (pos == nullptr) {
-        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
-    } else {
-        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
-    }
-}
-
-template<typename T>
-static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
-) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const dim3 block_nums(nrows, num_blocks_x, 1);
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.0f / n_dims;
-
-    if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
-    } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-            theta_scale, inv_ndims
-        );
-    }
-}
-
-static void rope_glm_f32_cuda(
-    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, int n_ctx, cudaStream_t stream
-) {
-    GGML_ASSERT(ncols % 4 == 0);
-    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
-    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
-    const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
-}
-
-static void rope_cuda_f16(
-    const half * x, half * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
-
-    rope_cuda<half>(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
-}
-
-static void rope_cuda_f32(
-    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
-
-    rope_cuda<float>(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
-}
-
-static void rope_neox_cuda_f16(
-    const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
-
-    rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
-}
-
-static void rope_neox_cuda_f32(
-    const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
-) {
-
-    rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
-}
-
-void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past      = ((int32_t *) dst->op_params)[0];
-    const int n_dims      = ((int32_t *) dst->op_params)[1];
-    const int mode        = ((int32_t *) dst->op_params)[2];
-    const int n_ctx       = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
-
-    // RoPE alteration for extended context
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    const int32_t * pos = nullptr;
-    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_d;
-    }
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
-
-    // compute
-    if (is_glm) {
-        GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_d, dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, stream);
-    } else if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda_f32(
-                (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda_f16(
-                (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
-            );
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_cuda_f32(
-                (const float *)src0_d, (float *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda_f16(
-                (const half *)src0_d, (half *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, stream
-            );
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-}
+#include "rope.cuh"
+
+struct rope_corr_dims {
+    float v[4];
+};
+
+static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static __device__ void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    *cos_theta = cosf(theta) * mscale;
+    *sin_theta = sinf(theta) * mscale;
+}
+
+// rope == RoPE == rotary positional embedding
+template<typename T, bool has_pos>
+static __global__ void rope(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
+
+    const int p = has_pos ? pos[i2] : 0;
+    const float theta_base = p*powf(freq_base, -float(col)/ncols);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + 1];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + 1] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_pos>
+static __global__ void rope_neox(
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
+) {
+    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    if (ib > 0) {
+        const int i = row*ncols + ib*n_dims + ic;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i  = row*ncols + ib*n_dims + ic/2;
+    const int i2 = row/p_delta_rows;
+
+    float cur_rot = inv_ndims * ic - ib;
+
+    const int p = has_pos ? pos[i2] : 0;
+    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+static __global__ void rope_glm_f32(
+    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    int n_ctx
+) {
+    const int col = blockDim.x*blockIdx.x + threadIdx.x;
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
+
+    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
+     // FIXME: this is likely wrong
+    const int p = pos != nullptr ? pos[i2] : 0;
+
+    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
+    const float sin_block_theta = sinf(block_theta);
+    const float cos_block_theta = cosf(block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
+
+template<typename T>
+static void rope_cuda(
+    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+    if (pos == nullptr) {
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
+    } else {
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+        );
+    }
+}
+
+template<typename T>
+static void rope_neox_cuda(
+    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nrows, num_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
+    if (pos == nullptr) {
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
+        );
+    } else {
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
+        );
+    }
+}
+
+static void rope_glm_f32_cuda(
+    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, int n_ctx, cudaStream_t stream
+) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
+    const dim3 block_nums(num_blocks_x, nrows, 1);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
+}
+
+static void rope_cuda_f16(
+    const half * x, half * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
+
+    rope_cuda<half>(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
+}
+
+static void rope_cuda_f32(
+    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
+
+    rope_cuda<float>(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
+}
+
+static void rope_neox_cuda_f16(
+    const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) {
+
+    rope_neox_cuda<half>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
+}
+
+static void rope_neox_cuda_f32(
+    const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
+) {
+
+    rope_neox_cuda<float>(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream);
+}
+
+void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t nrows = ggml_nrows(src0);
+
+    //const int n_past      = ((int32_t *) dst->op_params)[0];
+    const int n_dims      = ((int32_t *) dst->op_params)[1];
+    const int mode        = ((int32_t *) dst->op_params)[2];
+    const int n_ctx       = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
+
+    // RoPE alteration for extended context
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    const int32_t * pos = nullptr;
+    if ((mode & 1) == 0) {
+        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->ne[0] == ne2);
+        pos = (const int32_t *) src1_d;
+    }
+
+    const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
+
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
+    // compute
+    if (is_glm) {
+        GGML_ASSERT(false);
+        rope_glm_f32_cuda(src0_d, dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, stream);
+    } else if (is_neox) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, stream
+            );
+        } else {
+            GGML_ASSERT(false);
+        }
+    } else {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, stream
+            );
+        } else {
+            GGML_ASSERT(false);
+        }
+    }
+}
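
Reviewer note: in ggml_cuda_op_rope above, the mode bits select the variant — (mode & 1) == 0 requires the position tensor, mode & 2 picks the NeoX layout, and mode & 4 the GLM path (asserted off). The core operation rotates each column pair by theta = p * freq_base^(-col/ncols). The sketch below is a minimal CPU reference for one row of the basic path, assuming freq_scale == 1, ext_factor == 0 (so the YaRN correction drops out) and attn_factor == 1; rope_ref is a hypothetical name.

    // CPU sketch of the basic (non-NeoX) rotation for one row.
    static void rope_ref(const float * x, float * dst, int ncols, int pos, float freq_base) {
        for (int c = 0; c < ncols; c += 2) {
            const float theta = pos * powf(freq_base, -(float)c / ncols);
            const float cos_t = cosf(theta), sin_t = sinf(theta);
            dst[c + 0] = x[c + 0]*cos_t - x[c + 1]*sin_t;   // 2-D rotation of each (x0, x1) pair
            dst[c + 1] = x[c + 0]*sin_t + x[c + 1]*cos_t;
        }
    }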

+ 5 - 5
llama/ggml-cuda/rope.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_ROPE_BLOCK_SIZE 256
-
-void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_ROPE_BLOCK_SIZE 256
+
+void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 5 - 5
llama/ggml-cuda/scale.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_SCALE_BLOCK_SIZE 256
-
-void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_SCALE_BLOCK_SIZE 256
+
+void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 5 - 5
llama/ggml-cuda/softmax.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
-
-void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
+
+void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 40 - 40
llama/ggml-cuda/sumrows.cu

@@ -1,40 +1,40 @@
-#include "sumrows.cuh"
-
-static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.x;
-    const int col = threadIdx.x;
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum);
-
-    if (col == 0) {
-        dst[row] = sum;
-    }
-}
-
-static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(nrows, 1, 1);
-    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-}
-
-void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
-}
+#include "sumrows.cuh"
+
+static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(nrows, 1, 1);
+    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
+void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
+}
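
Reviewer note: k_sum_rows_f32 launches one block per row with WARP_SIZE threads striding the columns, then warp_reduce_sum collapses the 32 partial sums so only lane 0 writes. warp_reduce_sum is defined in common.cuh, which is not part of this hunk, so the following is only a sketch of the usual shuffle-based shape it is assumed to have.

    // Assumed shape of the warp reduction used above (sketch; real definition in common.cuh).
    static __device__ float warp_reduce_sum_sketch(float x) {
        for (int mask = 16; mask > 0; mask >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);   // butterfly across the 32 lanes
        }
        return x;
    }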

+ 3 - 3
llama/ggml-cuda/sumrows.cuh

@@ -1,3 +1,3 @@
-#include "common.cuh"
-
-void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 47 - 47
llama/ggml-cuda/tsembd.cu

@@ -1,47 +1,47 @@
-#include "tsembd.cuh"
-
-static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
-    // blockIdx.y: idx of timesteps->ne[0]
-    // blockIdx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
-    int i = blockIdx.y;
-    int j = threadIdx.x + blockIdx.x * blockDim.x;
-    float * embed_data = (float *)((char *)dst +  i*nb1);
-
-    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
-        embed_data[dim] = 0.f;
-    }
-
-    int half = dim / 2;
-    if (j >= half) {
-        return;
-    }
-
-    float timestep = timesteps[i];
-    float freq = (float)expf(-logf(max_period) * j / half);
-    float arg = timestep * freq;
-    embed_data[j] = cosf(arg);
-    embed_data[j + half] = sinf(arg);
-}
-
-static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
-                                        const int dim, const int max_period, cudaStream_t stream) {
-    int half_ceil = (dim + 1) / 2;
-    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne00, 1);
-    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
-}
-
-void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const int dim = dst->op_params[0];
-    const int max_period = dst->op_params[1];
-
-    timestep_embedding_f32_cuda(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
-}
+#include "tsembd.cuh"
+
+static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
+    // blockIdx.y: idx of timesteps->ne[0]
+    // blockIdx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
+    int i = blockIdx.y;
+    int j = threadIdx.x + blockIdx.x * blockDim.x;
+    float * embed_data = (float *)((char *)dst +  i*nb1);
+
+    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
+        embed_data[dim] = 0.f;
+    }
+
+    int half = dim / 2;
+    if (j >= half) {
+        return;
+    }
+
+    float timestep = timesteps[i];
+    float freq = (float)expf(-logf(max_period) * j / half);
+    float arg = timestep * freq;
+    embed_data[j] = cosf(arg);
+    embed_data[j + half] = sinf(arg);
+}
+
+static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1,
+                                        const int dim, const int max_period, cudaStream_t stream) {
+    int half_ceil = (dim + 1) / 2;
+    int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne00, 1);
+    timestep_embedding_f32<<<gridDim, CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE, 0, stream>>>(x, dst, nb1, dim, max_period);
+}
+
+void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int dim = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+
+    timestep_embedding_f32_cuda(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
+}
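
Reviewer note: timestep_embedding_f32 computes freq_j = expf(-logf(max_period) * j / half) and writes cosines into the first half of each row and sines into the second, zero-filling one extra slot for odd dims. A CPU sketch for one row; timestep_embedding_ref is a hypothetical name.

    // CPU sketch of one row of the sinusoidal timestep embedding above.
    static void timestep_embedding_ref(float t, float * out, int dim, int max_period) {
        const int half = dim / 2;
        for (int j = 0; j < half; ++j) {
            const float freq = expf(-logf((float)max_period) * j / half);
            out[j]        = cosf(t * freq);   // first half: cosines
            out[j + half] = sinf(t * freq);   // second half: sines
        }
        if (dim % 2 != 0) {
            out[dim] = 0.0f;                  // mirrors the kernel's zero-fill for odd dim
        }
    }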

+ 5 - 5
llama/ggml-cuda/tsembd.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
-
-void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
+
+void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 5 - 5
llama/ggml-cuda/upscale.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_UPSCALE_BLOCK_SIZE 256
-
-void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+
+void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 1280 - 1280
llama/ggml-cuda/vecdotq.cuh

@@ -1,1280 +1,1280 @@
-#include "common.cuh"
-
-static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-
-// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
-// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
-
-#define VDR_Q4_0_Q8_1_MMVQ 2
-#define VDR_Q4_0_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
-    const int * v, const int * u, const float & d4, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_1_Q8_1_MMVQ 2
-#define VDR_Q4_1_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
-    const float d4d8 = tmp.x;
-    const float m4s8 = tmp.y;
-#else
-    const float2 dm4f = __half22float2(dm4);
-    const float2 ds8f = __half22float2(ds8);
-    const float d4d8 = dm4f.x * ds8f.x;
-    const float m4s8 = dm4f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
-    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_0_Q8_1_MMVQ 2
-#define VDR_Q5_0_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_1_Q8_1_MMVQ 2
-#define VDR_Q5_1_Q8_1_MMQ  4
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
-    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
-    const float d5d8 = tmp.x;
-    const float m5s8 = tmp.y;
-#else
-    const float2 dm5f = __half22float2(dm5);
-    const float2 ds8f = __half22float2(ds8);
-    const float d5d8 = dm5f.x * ds8f.x;
-    const float m5s8 = dm5f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
-    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q8_0_Q8_1_MMVQ 2
-#define VDR_Q8_0_Q8_1_MMQ 8
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
-    const int * v, const int * u, const float & d8_0, const float & d8_1) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-
-    return d8_0*d8_1 * sumi;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
-    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
-    const float d8d8 = tmp.x;
-    const float m8s8 = tmp.y;
-#else
-    const float2 dm8f = __half22float2(dm8);
-    const float2 ds8f = __half22float2(ds8);
-    const float d8d8 = dm8f.x * ds8f.x;
-    const float m8s8 = dm8f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
-    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q2_K_Q8_1_MMVQ 1
-#define VDR_Q2_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
-    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm2, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = scales[2*i];
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-
-        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return dm2f.x*sumf_d - dm2f.y*sumf_m;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const half2 & dm2, const float & d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi_d = 0;
-    int sumi_m = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
-        int sumi_d_sc = 0;
-
-        const int sc = scales[i0 / (QI8_1/2)];
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-
-#pragma unroll
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
-            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
-        }
-
-        sumi_d += sumi_d_sc * (sc & 0xF);
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q3_K_Q8_1_MMVQ 1
-#define VDR_Q3_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
-    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
-    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
-
-        const int isc_low = isc % (QK_K/32);
-        const int sc_shift_low = 4 * (isc / (QK_K/32));
-        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi = __vsubss4(vil, vih);
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d3 * sumf;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
-    const float & d3, const float & d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
-        int sumi_sc = 0;
-
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
-        }
-
-        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
-    }
-
-    return d3*d8 * sumi;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_K_Q8_1_MMVQ 2
-#define VDR_Q4_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K; ++i) {
-        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
-        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_K_Q8_1_MMVQ 2
-#define VDR_Q5_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
-    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
-        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
-        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
-
-        const int v0i = vl0i | vh0i;
-        const int v1i = vl1i | vh1i;
-
-        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);
-
-    }
-
-    const float2 dm5f = __half22float2(dm5);
-
-    return dm5f.x*sumf_d - dm5f.y*sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
-    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q6_K_Q8_1_MMVQ 1
-#define VDR_Q6_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
-    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
-    const float & d, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = scales[4*i];
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-
-        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
-    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
-    const float & d6, const float * __restrict__ d8) {
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-
-#pragma unroll
-    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
-        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
-
-#pragma unroll
-        for (int i = i0; i < i0 + 2; ++i) {
-            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
-            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
-
-            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
-            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
-        }
-
-        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
-    }
-
-    return d6 * sumf_d;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
-
-    int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
-}
-
-
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-
-    int v[VDR_Q4_1_Q8_1_MMVQ];
-    int u[2*VDR_Q4_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
-        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
-
-    int vl[VDR_Q5_0_Q8_1_MMVQ];
-    int vh[VDR_Q5_0_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
-        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
-        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
-    }
-
-    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
-}
-
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-
-    int vl[VDR_Q5_1_Q8_1_MMVQ];
-    int vh[VDR_Q5_1_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
-        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
-        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
-    }
-
-    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
-}
-
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int v[VDR_Q8_0_Q8_1_MMVQ];
-    int u[VDR_Q8_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
-        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
-        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-    }
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
-}
-
-static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
-
-    const int bq8_offset = QR2_K * (iqs / QI8_1);
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const uint8_t * scales = bq2_K->scales + scale_offset;
-
-    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-    int    u[QR2_K];
-    float d8[QR2_K];
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++ i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
-    }
-
-    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
-}
-
-static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
-
-    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const float d = bq3_K->d;
-
-    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
-
-    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
-
-    int    u[QR3_K];
-    float d8[QR3_K];
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
-    }
-
-    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
-}
-
-static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    int    v[2];
-    int    u[2*QR4_K];
-    float d8[QR4_K];
-
-    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-
-    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
-    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
-    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
-    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-
-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    v[0] = q4[0];
-    v[1] = q4[4];
-
-    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-    for (int i = 0; i < QR4_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2float(bq8i->ds);
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = __low2float(bq8_1[0].ds);
-    const float d8_2 = __low2float(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
-    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    int   vl[2];
-    int   vh[2];
-    int    u[2*QR5_K];
-    float d8[QR5_K];
-
-    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
-    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
-
-    vl[0] = ql[0];
-    vl[1] = ql[4];
-
-    vh[0] = qh[0] >> bq8_offset;
-    vh[1] = qh[4] >> bq8_offset;
-
-    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = __low2float(bq8i->ds);
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = __low2half(bq8_1[0].ds);
-    const float d8_2 = __low2half(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
-
-    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
-    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
-    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-
-    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
-    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
-
-    const int8_t * scales = bq6_K->scales + scale_offset;
-
-    int    u[QR6_K];
-    float d8[QR6_K];
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
-    }
-
-    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
-}
-
-static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
-    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
-
-#if QR2_XXS == 8
-    const int ib32 = iqs;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    uint32_t aux32 = q2[2] | (q2[3] << 16);
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
-        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
-        for (int j = 0; j < 8; ++j) {
-            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
-        q8 += 8;
-        aux32 >>= 7;
-    }
-    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
-    return d * sumi;
-#else
-    // iqs is 0...15
-    const int ib32 = iqs/2;
-    const int il = iqs%2;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
-    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
-    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
-    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
-    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
-    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
-    int sumi1 = 0, sumi2 = 0;
-    for (int j = 0; j < 8; ++j) {
-        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
-        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
-    }
-    return d * (sumi1 + sumi2);
-#endif
-#else
-    NO_DEVICE_CODE;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
-    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
-
-    const int ib32 = iqs;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
-    const uint8_t ls2 = bq2->scales[ib32] >>  4;
-    int sumi1 = 0;
-    for (int l = 0; l < 2; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
-        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
-        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
-        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
-        q8 += 8;
-    }
-    int sumi2 = 0;
-    for (int l = 2; l < 4; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
-        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
-        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
-        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
-        q8 += 8;
-    }
-    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
-    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
-}
-
-// TODO
-static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
-    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
-
-    const int ib32 = iqs;
-    const int8_t  * q8 = bq8_1[ib32].qs;
-    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
-    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
-    const uint8_t ls2 = bq2->scales[ib32] >>  4;
-    int sumi1 = 0;
-    for (int l = 0; l < 2; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
-        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
-        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
-        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
-        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
-        q8 += 8;
-    }
-    int sumi2 = 0;
-    for (int l = 2; l < 4; ++l) {
-        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
-        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
-        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
-        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
-        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
-        q8 += 8;
-    }
-    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
-    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
-#else
-    GGML_UNUSED(ksigns64);
-    NO_DEVICE_CODE;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
-    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
-
-    const int ib32 = iqs;
-    const uint8_t  * q3 = bq2->qs + 8*ib32;
-    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    uint32_t aux32 = gas[0] | (gas[1] << 16);
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
-        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
-        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
-        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
-        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
-        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
-        q8 += 8;
-        aux32 >>= 7;
-    }
-    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
-    return d * sumi;
-#else
-    NO_DEVICE_CODE;
-#endif
-#else
-    NO_DEVICE_CODE;
-#endif
-}
-
-// TODO: don't use lookup table for signs
-static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-#if QK_K == 256
-    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
-
-    const int ib32 = iqs;
-    const uint8_t  * qs = bq2->qs + 8*ib32;
-    const int8_t   * q8 = bq8_1[ib32].qs;
-    int sumi = 0;
-    for (int l = 0; l < 4; ++l) {
-        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
-        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
-        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
-        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
-        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
-        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
-        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
-        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
-        q8 += 8;
-    }
-    const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
-    return d * sumi;
-#else
-    NO_DEVICE_CODE;
-#endif
-#else
-    NO_DEVICE_CODE;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
-    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
-
-    const int ib32 = iqs;
-    int sumi = 0;
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const int * q8 = (const int *)bq8_1[ib32].qs;
-    for (int l = 0; l < 4; ++l) {
-        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
-        int grid0 = grid[0] & 0x0f0f0f0f;
-        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
-        sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
-    }
-#else
-    const int8_t * q8 = bq8_1[ib32].qs;
-    for (int l = 0; l < 4; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
-        for (int j = 0; j < 4; ++j) {
-            sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
-        }
-        q8 += 8;
-    }
-#endif
-    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
-    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
-    const float d = d1q * __low2float (bq8_1[ib32].ds);
-    const float m = d1q * __high2float(bq8_1[ib32].ds);
-    return d * sumi + m * delta;
-#else
-    NO_DEVICE_CODE;
-#endif
-}
-
-static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-#if QK_K == 256
-    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
-
-    const int ib32 = iqs;
-    int   sumi[2] = {0, 0};
-    float sumf[2] = {0.f, 0.f};
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const int * q8 = (const int *)bq8_1[ib32].qs;
-    for (int l = 0; l < 4; ++l) {
-        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
-        int grid0 = grid[0] & 0x0f0f0f0f;
-        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
-        sumi[l/2] = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi[l/2]));
-        const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
-        const int sumy = __dp4a(q8[2*l+1], 0x01010101, __dp4a(q8[2*l+0], 0x01010101, 0));
-        sumf[l/2] += delta*sumy;
-    }
-#else
-    const int8_t * q8 = bq8_1[ib32].qs;
-    for (int l = 0; l < 4; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
-        int sumy = 0;
-        for (int j = 0; j < 4; ++j) {
-            sumi[l/2] += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
-            sumy += q8[j] + q8[j+4];
-        }
-        const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
-        sumf[l/2] += delta*sumy;
-        q8 += 8;
-    }
-#endif
-    iq1m_scale_t scale;
-    const uint16_t * sc = (const uint16_t *)bq1->scales;
-    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-    const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
-    return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
-#else
-    NO_DEVICE_CODE;
-#endif
-}
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
-        int & val1, int & val2) {
-
-    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
-    aux32 = q4 & 0x0f0f0f0f;
-    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
-    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
-    val1 = v1 | (v2 << 16);
-    aux32 = (q4 >> 4) & 0x0f0f0f0f;
-    v1 = values[q8[0]] | (values[q8[1]] << 8);
-    v2 = values[q8[2]] | (values[q8[3]] << 8);
-    val2 = v1 | (v2 << 16);
-}
-#endif
-
-static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
-    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
-
-    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
-
-    int v1, v2;
-    int sumi1 = 0, sumi2 = 0;
-    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
-        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
-        get_int_from_table_16(aux, values, v1, v2);
-        sumi1 = __dp4a(v1, q8[l+0], sumi1);
-        sumi2 = __dp4a(v2, q8[l+4], sumi2);
-    }
-
-#else
-    const uint8_t * q4 = bq->qs + 4*iqs;
-    const int8_t  * q8 = bq8_1->qs + 4*iqs;
-
-    int sumi1 = 0, sumi2 = 0;
-    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
-        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
-        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
-    }
-#endif
-    const float d = (float)bq->d * __low2float(bq8_1->ds);
-    return d * (sumi1 + sumi2);
-}
-
-static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
-
-#if QK_K == 256
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-
-    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
-    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
-
-    // iqs is 0...7
-    const int ib32 = iqs;
-    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
-    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
-    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
-    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
-    int v1, v2;
-    int sumi1 = 0, sumi2 = 0;
-    for (int j = 0; j < 4; ++j) {
-        get_int_from_table_16(q4[j], values, v1, v2);
-        sumi1 = __dp4a(v1, q8[j+0], sumi1);
-        sumi2 = __dp4a(v2, q8[j+4], sumi2);
-    }
-    return d * (sumi1 + sumi2);
-
-#else
-    NO_DEVICE_CODE;
-#endif
-#else
-    return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
-#endif
-}
+#include "common.cuh"
+
+static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
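+
+// usage note: i32 indexes in units of sizeof(int), so e.g. get_int_from_uint8(x8, 3) packs
+// bytes x8[12..15] into one int; the *_aligned variants do a single 32-bit load and therefore
+// require x8 + sizeof(int)*i32 to be 4-byte aligned (the callers below only use them on
+// block members that are expected to be 4-byte aligned)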
+
+
+// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
+// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
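+// e.g. VDR_Q4_0_Q8_1_MMVQ == 2 below means that in the mul_mat_vec_q path each thread loads
+// v[2] (vdr ints == 16 packed 4-bit quants) and u[4] (2*vdr ints == 16 int8 quants) per call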
+
+#define VDR_Q4_0_Q8_1_MMVQ 2
+#define VDR_Q4_0_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
+    const int * v, const int * u, const float & d4, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+    const float2 ds8f = __half22float2(ds8);
+
+    // second part effectively subtracts 8 from each quant value
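+    // a sketch of the algebra, assuming ds8.y holds (approximately) d8 * sum(u) for the whole
+    // q8_1 block, as written by the quantize_q8_1 kernel:
+    //   d4*d8 * sum((q-8)*u) = d4 * (d8*sum(q*u) - 8*d8*sum(u)) = d4 * (sumi*ds8.x - 8*ds8.y)
+    // each of the QI4_0/vdr threads sharing a block subtracts only its vdr/QI4_0 share of 8*ds8.y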
+    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_1_Q8_1_MMVQ 2
+#define VDR_Q4_1_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
+    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+    const float d4d8 = tmp.x;
+    const float m4s8 = tmp.y;
+#else
+    const float2 dm4f = __half22float2(dm4);
+    const float2 ds8f = __half22float2(ds8);
+    const float d4d8 = dm4f.x * ds8f.x;
+    const float m4s8 = dm4f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
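+    // with vdr == 2 the divisor QI8_1/(vdr * QR4_1) == 2, i.e. the number of threads sharing
+    // one q4_1 block, so the min term m4*s8 is added exactly once per block in total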
+    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_0_Q8_1_MMVQ 2
+#define VDR_Q5_0_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
+    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 bits of qs; the 5th bit of each value still comes from qh
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 bits of qs; the 5th bit of each value still comes from qh
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+    const float2 ds8f = __half22float2(ds8);
+
+    // second part effectively subtracts 16 from each quant value
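+    // (same derivation as in vec_dot_q4_0_q8_1_impl above, with an offset of 16 because the
+    // 5-bit values encode q - 16)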
+    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_1_Q8_1_MMVQ 2
+#define VDR_Q5_1_Q8_1_MMQ  4
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
+    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 bits of qs; the 5th bit of each value still comes from qh
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 bits of qs; the 5th bit of each value still comes from qh
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+    const float d5d8 = tmp.x;
+    const float m5s8 = tmp.y;
+#else
+    const float2 dm5f = __half22float2(dm5);
+    const float2 ds8f = __half22float2(ds8);
+    const float d5d8 = dm5f.x * ds8f.x;
+    const float m5s8 = dm5f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
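+    // (QI5_1 / vdr == 2 for vdr == 2: two threads share one q5_1 block)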
+    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
+
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q8_0_Q8_1_MMVQ 2
+#define VDR_Q8_0_Q8_1_MMQ  8
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
+    const int * v, const int * u, const float & d8_0, const float & d8_1) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+    return d8_0*d8_1 * sumi;
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
+    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
+    const float d8d8 = tmp.x;
+    const float m8s8 = tmp.y;
+#else
+    const float2 dm8f = __half22float2(dm8);
+    const float2 ds8f = __half22float2(ds8);
+    const float d8d8 = dm8f.x * ds8f.x;
+    const float m8s8 = dm8f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1 / vdr to compensate for multiple threads adding it
+    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q2_K_Q8_1_MMVQ 1
+#define VDR_Q2_K_Q8_1_MMQ  2
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
+    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const half2 & dm2, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = scales[2*i];
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+
+        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
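+        // e.g. m == 0x05 -> 0x05050505, so __dp4a(m, u[i], 0) == 5 * (sum of the 4 bytes of u[i])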
+        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return dm2f.x*sumf_d - dm2f.y*sumf_m;
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const half2 & dm2, const float & d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi_d = 0;
+    int sumi_m = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
+        int sumi_d_sc = 0;
+
+        const int sc = scales[i0 / (QI8_1/2)];
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
+        }
+
+        sumi_d += sumi_d_sc * (sc & 0xF);
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q3_K_Q8_1_MMVQ 1
+#define VDR_Q3_K_Q8_1_MMQ  2
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
+    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
+    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d3 * sumf;
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
+    const float & d3, const float & d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
+        int sumi_sc = 0;
+
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+        }
+
+        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+    }
+
+    return d3*d8 * sumi;
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_K_Q8_1_MMVQ 2
+#define VDR_Q4_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K; ++i) {
+        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_K_Q8_1_MMVQ 2
+#define VDR_Q5_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
+    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
+        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
+        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
+
+        const int v0i = vl0i | vh0i;
+        const int v1i = vl1i | vh1i;
+
+        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);
+
+    }
+
+    const float2 dm5f = __half22float2(dm5);
+
+    return dm5f.x*sumf_d - dm5f.y*sumf_m;
+
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q6_K_Q8_1_MMVQ 1
+#define VDR_Q6_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
+    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
+    const float & d, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = scales[4*i];
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
+    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
+    const float & d6, const float * __restrict__ d8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
+        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+
+#pragma unroll
+        for (int i = i0; i < i0 + 2; ++i) {
+            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
+
+            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+        }
+
+        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
+    }
+
+    return d6 * sumf_d;
+
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int v[VDR_Q4_0_Q8_1_MMVQ];
+    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
+}
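+
+// a rough sketch of how these per-block functions are consumed; the real loop lives in the
+// mul_mat_vec_q kernel (mmvq.cu) and computes block indices differently, so this is only
+// illustrative pseudocode:
+//
+//   float tmp = 0.0f;
+//   for (/* each q4_0/q8_1 block pair assigned to this thread */) {
+//       tmp += vec_dot_q4_0_q8_1(&x_block, &y_block, iqs);
+//   }
+//   tmp = warp_reduce_sum(tmp); // from common.cuh; lane 0 then writes the result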
+
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    int v[VDR_Q4_1_Q8_1_MMVQ];
+    int u[2*VDR_Q4_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int vl[VDR_Q5_0_Q8_1_MMVQ];
+    int vh[VDR_Q5_0_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
+        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
+    }
+
+    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    int vl[VDR_Q5_1_Q8_1_MMVQ];
+    int vh[VDR_Q5_1_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
+        vh[i]    = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
+    }
+
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int v[VDR_Q8_0_Q8_1_MMVQ];
+    int u[VDR_Q8_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
+        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+    }
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
+}
+
+static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
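+    // e.g. with QI8_1 == 8: iqs = 0..3 -> scale_offset 0, 4..7 -> 1, 8..11 -> 8, 12..15 -> 9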
+
+    const uint8_t * scales = bq2_K->scales + scale_offset;
+
+    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
+    int    u[QR2_K];
+    float d8[QR2_K];
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
+    }
+
+    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
+}
+
+static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const float d = bq3_K->d;
+
+    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
+
+    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
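+    // (q3_K stores q = 2-bit low part + 4*hmask_bit, with 4 subtracted overall; thanks to the ~
+    // above, vih in the impl ends up as 0x04 in exactly the byte lanes where 4 must be subtracted)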
+
+    int    u[QR3_K];
+    float d8[QR3_K];
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
+    }
+
+    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
+}
+
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+#ifndef GGML_QKK_64
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    int    v[2];
+    int    u[2*QR4_K];
+    float d8[QR4_K];
+
+    // iqs is in 0,2..30. bq8_offset = 2*(iqs/8) -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
+
+    // iqs/2 = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs/2 = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs/2 = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs/2 = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    v[0] = q4[0];
+    v[1] = q4[4];
+
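+    // unpack the 6-bit (scale, min) pair for this pair of sub-blocks: the first four
+    // sub-blocks store them directly in the low 6 bits, while the later ones are
+    // reassembled from their 4-bit low parts and the spare top-2 bits of the first bytes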
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = __low2float(bq8i->ds);
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
+
+#else
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    const uint16_t * a = (const uint16_t *)bq4_K->scales;
+    aux16[0] = a[0] & 0x0f0f;
+    aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
+
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+    return dall * sumf_d - dmin * sumf_m;
+
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif
+}
+
+static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+#ifndef GGML_QKK_64
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    int   vl[2];
+    int   vh[2];
+    int    u[2*QR5_K];
+    float d8[QR5_K];
+
+    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
+
+    vl[0] = ql[0];
+    vl[1] = ql[4];
+
+    vh[0] = qh[0] >> bq8_offset;
+    vh[1] = qh[4] >> bq8_offset;
+
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = __low2float(bq8i->ds);
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
+
+#else
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int8_t * s = bq5_K->scales;
+
+    const float d = bq5_K->d;
+
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * ql = (const int *)bq5_K->qs + (iqs/2);
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int step = 4 * (iqs/2); // 0, 4, 8, 12
+    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
+    const int in = step%8; // 0, 4, 0, 4
+    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+    return d * sumf_d;
+
+#else
+    NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif
+}
+
+static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
+    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
+
+    const int8_t * scales = bq6_K->scales + scale_offset;
+
+    int    u[QR6_K];
+    float d8[QR6_K];
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
+        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
+    }
+
+    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
+}
+
+static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if QK_K == 256
+    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
+
+#if QR2_XXS == 8
+    const int ib32 = iqs;
+    const uint16_t * q2 = bq2->qs + 4*ib32;
+    const uint8_t  * aux8 = (const uint8_t *)q2;
+    const int8_t   * q8 = bq8_1[ib32].qs;
+    uint32_t aux32 = q2[2] | (q2[3] << 16);
+    int sumi = 0;
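+    // 4 groups of 8 weights each: aux8[l] indexes the 256-entry iq2xxs codebook,
+    // and 7 bits of aux32 select the per-weight signs for the group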
+    for (int l = 0; l < 4; ++l) {
+        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
+        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
+        for (int j = 0; j < 8; ++j) {
+            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
+        }
+        q8 += 8;
+        aux32 >>= 7;
+    }
+    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
+    return d * sumi;
+#else
+    // iqs is 0...15
+    const int ib32 = iqs/2;
+    const int il = iqs%2;
+    const uint16_t * q2 = bq2->qs + 4*ib32;
+    const uint8_t  * aux8 = (const uint8_t *)q2;
+    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
+    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
+    const uint32_t aux32 = q2[2] | (q2[3] << 16);
+    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
+    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
+    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
+    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
+    int sumi1 = 0, sumi2 = 0;
+    for (int j = 0; j < 8; ++j) {
+        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
+        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
+    }
+    return d * (sumi1 + sumi2);
+#endif
+#else
+    NO_DEVICE_CODE;
+#endif
+}
+
+static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
+
+    const int ib32 = iqs;
+    const uint16_t * q2 = bq2->qs + 4*ib32;
+    const int8_t   * q8 = bq8_1[ib32].qs;
+    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
+    const uint8_t ls2 = bq2->scales[ib32] >>  4;
+    int sumi1 = 0;
+    for (int l = 0; l < 2; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
+        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
+        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
+        q8 += 8;
+    }
+    int sumi2 = 0;
+    for (int l = 2; l < 4; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
+        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
+        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
+    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
+#else
+    GGML_UNUSED(ksigns64);
+    NO_DEVICE_CODE;
+#endif
+#else
+    GGML_UNUSED(ksigns64);
+    NO_DEVICE_CODE;
+#endif
+}
+
+// TODO
+static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
+
+    const int ib32 = iqs;
+    const int8_t  * q8 = bq8_1[ib32].qs;
+    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
+    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
+    const uint8_t ls2 = bq2->scales[ib32] >>  4;
+    int sumi1 = 0;
+    for (int l = 0; l < 2; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
+        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
+        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
+        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
+        q8 += 8;
+    }
+    int sumi2 = 0;
+    for (int l = 2; l < 4; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
+        const uint32_t signs1 = __vcmpeq4(((signs[l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
+        const int grid_l = __vsub4(grid[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid[1] ^ signs1, signs1);
+        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
+    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
+#else
+    GGML_UNUSED(ksigns64);
+    NO_DEVICE_CODE;
+#endif
+#else
+    GGML_UNUSED(ksigns64);
+    NO_DEVICE_CODE;
+#endif
+}
+
+static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
+
+    const int ib32 = iqs;
+    const uint8_t  * q3 = bq2->qs + 8*ib32;
+    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
+    const int8_t   * q8 = bq8_1[ib32].qs;
+    uint32_t aux32 = gas[0] | (gas[1] << 16);
+    int sumi = 0;
+    for (int l = 0; l < 4; ++l) {
+        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
+        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
+        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
+        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
+        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
+        q8 += 8;
+        aux32 >>= 7;
+    }
+    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
+    return d * sumi;
+#else
+    NO_DEVICE_CODE;
+#endif
+#else
+    NO_DEVICE_CODE;
+#endif
+}
+
+// TODO: don't use lookup table for signs
+static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
+
+    const int ib32 = iqs;
+    const uint8_t  * qs = bq2->qs + 8*ib32;
+    const int8_t   * q8 = bq8_1[ib32].qs;
+    int sumi = 0;
+    for (int l = 0; l < 4; ++l) {
+        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
+        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
+        uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
+        uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >>  4) * 0x01010101) & 0x08040201, 0x08040201);
+        const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
+        const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
+        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
+        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
+    return d * sumi;
+#else
+    NO_DEVICE_CODE;
+#endif
+#else
+    NO_DEVICE_CODE;
+#endif
+}
+
+static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if QK_K == 256
+    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
+
+    const int ib32 = iqs;
+    int sumi = 0;
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const int * q8 = (const int *)bq8_1[ib32].qs;
+    for (int l = 0; l < 4; ++l) {
+        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
+        int grid0 = grid[0] & 0x0f0f0f0f;
+        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
+        sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
+    }
+#else
+    const int8_t * q8 = bq8_1[ib32].qs;
+    for (int l = 0; l < 4; ++l) {
+        const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
+        for (int j = 0; j < 4; ++j) {
+            sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
+        }
+        q8 += 8;
+    }
+#endif
+    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
+    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
+    const float d = d1q * __low2float (bq8_1[ib32].ds);
+    const float m = d1q * __high2float(bq8_1[ib32].ds);
+    return d * sumi + m * delta;
+#else
+    NO_DEVICE_CODE;
+#endif
+}
+
+static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if QK_K == 256
+    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
+
+    const int ib32 = iqs;
+    int   sumi[2] = {0, 0};
+    float sumf[2] = {0.f, 0.f};
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const int * q8 = (const int *)bq8_1[ib32].qs;
+    for (int l = 0; l < 4; ++l) {
+        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
+        int grid0 = grid[0] & 0x0f0f0f0f;
+        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
+        sumi[l/2] = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi[l/2]));
+        const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
+        const int sumy = __dp4a(q8[2*l+1], 0x01010101, __dp4a(q8[2*l+0], 0x01010101, 0));
+        sumf[l/2] += delta*sumy;
+    }
+#else
+    const int8_t * q8 = bq8_1[ib32].qs;
+    for (int l = 0; l < 4; ++l) {
+        const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
+        int sumy = 0;
+        for (int j = 0; j < 4; ++j) {
+            sumi[l/2] += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4);
+            sumy += q8[j] + q8[j+4];
+        }
+        const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
+        sumf[l/2] += delta*sumy;
+        q8 += 8;
+    }
+#endif
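+    // the fp16 super-block scale is stored scattered across the top 4 bits of the
+    // four 16-bit scale words; reassemble it here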
+    iq1m_scale_t scale;
+    const uint16_t * sc = (const uint16_t *)bq1->scales;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
+    return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
+#else
+    NO_DEVICE_CODE;
+#endif
+}
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
+        int & val1, int & val2) {
+
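+    // expand 8 packed 4-bit indices into 8 int8 table values:
+    // low nibbles land in val1, high nibbles in val2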
+    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
+    aux32 = q4 & 0x0f0f0f0f;
+    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
+    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val1 = v1 | (v2 << 16);
+    aux32 = (q4 >> 4) & 0x0f0f0f0f;
+    v1 = values[q8[0]] | (values[q8[1]] << 8);
+    v2 = values[q8[2]] | (values[q8[3]] << 8);
+    val2 = v1 | (v2 << 16);
+}
+#endif
+
+static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
+    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
+
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
+        get_int_from_table_16(aux, values, v1, v2);
+        sumi1 = __dp4a(v1, q8[l+0], sumi1);
+        sumi2 = __dp4a(v2, q8[l+4], sumi2);
+    }
+
+#else
+    const uint8_t * q4 = bq->qs + 4*iqs;
+    const int8_t  * q8 = bq8_1->qs + 4*iqs;
+
+    int sumi1 = 0, sumi2 = 0;
+    for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
+        sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
+        sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >>  4];
+    }
+#endif
+    const float d = (float)bq->d * __low2float(bq8_1->ds);
+    return d * (sumi1 + sumi2);
+}
+
+static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+#if QK_K == 256
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+
+    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    // iqs is 0...7
+    const int ib32 = iqs;
+    const int32_t  * q8 = (const int32_t *)bq8_1[ib32].qs;
+    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
+    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
+    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int j = 0; j < 4; ++j) {
+        get_int_from_table_16(q4[j], values, v1, v2);
+        sumi1 = __dp4a(v1, q8[j+0], sumi1);
+        sumi2 = __dp4a(v2, q8[j+4], sumi2);
+    }
+    return d * (sumi1 + sumi2);
+
+#else
+    NO_DEVICE_CODE;
+#endif
+#else
+    return vec_dot_iq4_nl_q8_1(vbq, bq8_1, iqs); // QK_K == 64: iq4_xs reduces to iq4_nl (a self-call here would recurse forever)
+#endif
+}

+ 66 - 66
llama/ggml-metal.h

@@ -1,66 +1,66 @@
-// An interface for computing a ggml_cgraph with Metal
-//
-// This is a fully functional interface that extends ggml with GPU support for Apple devices.
-// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
-//
-// How does it work?
-//
-// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
-// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
-// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
-//
-// You only need to make sure that all memory buffers that you used during the graph creation
-// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
-// used during the graph evaluation to determine the arguments of the compute kernels.
-//
-// Synchronization between device and host memory (for example for input and output tensors)
-// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
-//
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stddef.h>
-#include <stdbool.h>
-
-// max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64
-
-struct ggml_tensor;
-struct ggml_cgraph;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//
-// backend API
-// user-code should use only these functions
-//
-
-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
-GGML_API ggml_backend_t ggml_backend_metal_init(void);
-
-GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
-
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
-
-GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-
-// helper to check if the device supports a specific family
-// ideally, the user code should be doing these checks
-// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
-
-// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
-GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
-
-#ifdef __cplusplus
-}
-#endif
-
+// An interface for computing a ggml_cgraph with Metal
+//
+// This is a fully functional interface that extends ggml with GPU support for Apple devices.
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
+//
+// How does it work?
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
+// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
+// used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
+//
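+// A minimal host-side sketch of the backend API declared below (illustrative
+// only; the graph-building and compute calls come from ggml.h / ggml-backend.h,
+// not from this header):
+//
+//     ggml_backend_t backend = ggml_backend_metal_init();
+//     if (backend != NULL && ggml_backend_is_metal(backend)) {
+//         // ... build a ggml_cgraph and allocate its buffers, then:
+//         // ggml_backend_graph_compute(backend, graph);
+//     }
+//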
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stddef.h>
+#include <stdbool.h>
+
+// max memory buffers that can be mapped to the device
+#define GGML_METAL_MAX_BUFFERS 64
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// backend API
+// user-code should use only these functions
+//
+
+GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
+GGML_API ggml_backend_t ggml_backend_metal_init(void);
+
+GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
+GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
+GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+
+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
+
+// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
+GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+
+#ifdef __cplusplus
+}
+#endif
+

+ 0 - 0
llama/ggml-metal-darwin_arm64.m → llama/ggml-metal.m


+ 6859 - 0
llama/ggml-metal.metal

@@ -0,0 +1,6859 @@
+#define GGML_COMMON_DECL_METAL
+#define GGML_COMMON_IMPL_METAL
+#include "ggml-common.h"
+
+#include <metal_stdlib>
+
+using namespace metal;
+
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; }
+
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
+
+enum ggml_sort_order {
+    GGML_SORT_ORDER_ASC,
+    GGML_SORT_ORDER_DESC,
+};
+
+// general-purpose kernel for addition, multiplication and division of two tensors
+// pros: works for non-contiguous tensors, supports broadcast across all dims
+// cons: not very efficient
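+// e.g. adding a row vector of shape [ne10] to a matrix: the wrapped indices
+// i11 = i01 % ne11 and i10 = i0 % ne10 below repeat src1 across every output row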
+kernel void kernel_add(
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        constant  int64_t & ne00,
+        constant  int64_t & ne01,
+        constant  int64_t & ne02,
+        constant  int64_t & ne03,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant uint64_t & nb03,
+        constant  int64_t & ne10,
+        constant  int64_t & ne11,
+        constant  int64_t & ne12,
+        constant  int64_t & ne13,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant uint64_t & nb13,
+        constant  int64_t & ne0,
+        constant  int64_t & ne1,
+        constant  int64_t & ne2,
+        constant  int64_t & ne3,
+        constant uint64_t & nb0,
+        constant uint64_t & nb1,
+        constant uint64_t & nb2,
+        constant uint64_t & nb3,
+        constant  int64_t & offs,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig.z;
+    const int64_t i02 = tgpig.y;
+    const int64_t i01 = tgpig.x;
+
+    const int64_t i13 = i03 % ne13;
+    const int64_t i12 = i02 % ne12;
+    const int64_t i11 = i01 % ne11;
+
+    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
+    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + offs;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        const int i10 = i0 % ne10;
+        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) + *((device float *)(src1_ptr + i10*nb10));
+    }
+}
+
+kernel void kernel_mul(
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        constant  int64_t & ne00,
+        constant  int64_t & ne01,
+        constant  int64_t & ne02,
+        constant  int64_t & ne03,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant uint64_t & nb03,
+        constant  int64_t & ne10,
+        constant  int64_t & ne11,
+        constant  int64_t & ne12,
+        constant  int64_t & ne13,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant uint64_t & nb13,
+        constant  int64_t & ne0,
+        constant  int64_t & ne1,
+        constant  int64_t & ne2,
+        constant  int64_t & ne3,
+        constant uint64_t & nb0,
+        constant uint64_t & nb1,
+        constant uint64_t & nb2,
+        constant uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig.z;
+    const int64_t i02 = tgpig.y;
+    const int64_t i01 = tgpig.x;
+
+    const int64_t i13 = i03 % ne13;
+    const int64_t i12 = i02 % ne12;
+    const int64_t i11 = i01 % ne11;
+
+    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        const int i10 = i0 % ne10;
+        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) * *((device float *)(src1_ptr + i10*nb10));
+    }
+}
+
+kernel void kernel_div(
+        device const char * src0,
+        device const char * src1,
+        device       char * dst,
+        constant  int64_t & ne00,
+        constant  int64_t & ne01,
+        constant  int64_t & ne02,
+        constant  int64_t & ne03,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant uint64_t & nb03,
+        constant  int64_t & ne10,
+        constant  int64_t & ne11,
+        constant  int64_t & ne12,
+        constant  int64_t & ne13,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant uint64_t & nb13,
+        constant  int64_t & ne0,
+        constant  int64_t & ne1,
+        constant  int64_t & ne2,
+        constant  int64_t & ne3,
+        constant uint64_t & nb0,
+        constant uint64_t & nb1,
+        constant uint64_t & nb2,
+        constant uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig.z;
+    const int64_t i02 = tgpig.y;
+    const int64_t i01 = tgpig.x;
+
+    const int64_t i13 = i03 % ne13;
+    const int64_t i12 = i02 % ne12;
+    const int64_t i11 = i01 % ne11;
+
+    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        const int i10 = i0 % ne10;
+        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) / *((device float *)(src1_ptr + i10*nb10));
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
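+// nb is the broadcast period: tpig % nb wraps every output position back into the single src1 row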
+kernel void kernel_add_row(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant   uint64_t & nb [[buffer(28)]],
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] + src1[tpig % nb];
+}
+
+kernel void kernel_mul_row(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant   uint64_t & nb  [[buffer(28)]],
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src1[tpig % nb];
+}
+
+kernel void kernel_div_row(
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant   uint64_t & nb  [[buffer(28)]],
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] / src1[tpig % nb];
+}
+
+kernel void kernel_scale(
+        device const float * src0,
+        device       float * dst,
+        constant     float & scale,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * scale;
+}
+
+kernel void kernel_scale_4(
+        device const float4 * src0,
+        device       float4 * dst,
+        constant     float  & scale,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * scale;
+}
+
+kernel void kernel_clamp(
+        device const float * src0,
+        device       float * dst,
+        constant     float & min,
+        constant     float & max,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] < min ? min : (src0[tpig] > max ? max : src0[tpig]);
+}
+
+kernel void kernel_relu(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = max(0.0f, src0[tpig]);
+}
+
+kernel void kernel_sigmoid(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = 1.0f / (1.0f + exp(-src0[tpig]));
+}
+
+kernel void kernel_tanh(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+    dst[tpig] = precise::tanh(x);
+}
+
+constant float GELU_COEF_A     = 0.044715f;
+constant float GELU_QUICK_COEF = -1.702f;
+constant float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+
+kernel void kernel_gelu(
+    device const float * src0,
+    device       float * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+
+    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+kernel void kernel_gelu_4(
+    device const float4 * src0,
+    device       float4 * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+
+    // BEWARE !!!
+    // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!
+    // This was observed with Falcon 7B and 40B models
+    //
+    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+kernel void kernel_gelu_quick(
+    device const float * src0,
+    device       float * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+
+    dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
+}
+
+kernel void kernel_gelu_quick_4(
+    device const float4 * src0,
+    device       float4 * dst,
+    uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+
+    dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
+}
+
+kernel void kernel_silu(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float & x = src0[tpig];
+    dst[tpig] = x / (1.0f + exp(-x));
+}
+
+kernel void kernel_silu_4(
+        device const float4 * src0,
+        device       float4 * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    device const float4 & x = src0[tpig];
+    dst[tpig] = x / (1.0f + exp(-x));
+}
+
+kernel void kernel_sqr(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] * src0[tpig];
+}
+
+kernel void kernel_sum_rows(
+        device const float * src0,
+        device       float * dst,
+        constant  int64_t & ne00,
+        constant  int64_t & ne01,
+        constant  int64_t & ne02,
+        constant  int64_t & ne03,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant uint64_t & nb03,
+        constant  int64_t & ne10,
+        constant  int64_t & ne11,
+        constant  int64_t & ne12,
+        constant  int64_t & ne13,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb12,
+        constant uint64_t & nb13,
+        constant  int64_t & ne0,
+        constant  int64_t & ne1,
+        constant  int64_t & ne2,
+        constant  int64_t & ne3,
+        constant uint64_t & nb0,
+        constant uint64_t & nb1,
+        constant uint64_t & nb2,
+        constant uint64_t & nb3,
+        uint3 tpig[[thread_position_in_grid]]) {
+    int64_t i3 = tpig.z;
+    int64_t i2 = tpig.y;
+    int64_t i1 = tpig.x;
+
+    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
+        return;
+    }
+
+    device const float * src_row = (device const float *) ((device const char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
+    device       float * dst_row = (device       float *) ((device       char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);
+
+    float row_sum = 0;
+
+    for (int64_t i0 = 0; i0 < ne00; i0++) {
+        row_sum += src_row[i0];
+    }
+
+    dst_row[0] = row_sum;
+}
+
+template<typename T>
+kernel void kernel_soft_max(
+        device const  char * src0,
+        device const  char * src1,
+        device        char * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant     float & scale,
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = (tgpig) / (ne02*ne01);
+    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
+    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
+
+    device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+    device const     T * pmask = src1 != src0 ? (device const    T *) src1         + i01*ne00 : nullptr;
+    device       float * pdst  = (device       float *) dst  + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+
+    float slope = 1.0f;
+
+    // ALiBi
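+    // per-head slope: the first n_head_log2 heads use powers of m0,
+    // the remaining heads use odd powers of m1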
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
+    // parallel max
+    float lmax = -INFINITY;
+
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
+    }
+
+    // find the max value in the block
+    float max_val = simd_max(lmax);
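+    // two-level reduction: simd_max reduces within each simdgroup; when the threadgroup
+    // spans several simdgroups, combine their partial maxima via threadgroup memory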
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }
+
+    // parallel sum
+    float lsum = 0.0f;
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val);
+        lsum += exp_psrc0;
+        pdst[i00] = exp_psrc0;
+    }
+
+    // This barrier fixes a failing test
+    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    threadgroup_barrier(mem_flags::mem_none);
+
+    float sum = simd_sum(lsum);
+
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
+
+    const float inv_sum = 1.0f/sum;
+
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        pdst[i00] *= inv_sum;
+    }
+}
+
+template<typename T>
+kernel void kernel_soft_max_4(
+        device const  char * src0,
+        device const  char * src1,
+        device        char * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant     float & scale,
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = (tgpig) / (ne02*ne01);
+    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
+    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
+
+    device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
+    device const      T * pmask = src1 != src0 ? (device const     T *) src1         + i01*ne00/4 : nullptr;
+    device       float4 * pdst4 = (device       float4 *) dst  + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
+
+    float slope = 1.0f;
+
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
+    // parallel max
+    float4 lmax4 = -INFINITY;
+
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f)));
+    }
+
+    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
+
+    float max_val = simd_max(lmax);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = -INFINITY;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = max_val;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        max_val = buf[tiisg];
+        max_val = simd_max(max_val);
+    }
+
+    // parallel sum
+    float4 lsum4 = 0.0f;
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val);
+        lsum4 += exp_psrc4;
+        pdst4[i00] = exp_psrc4;
+    }
+
+    const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
+
+    // This barrier fixes a failing test
+    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    threadgroup_barrier(mem_flags::mem_none);
+
+    float sum = simd_sum(lsum);
+
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = sum;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        sum = buf[tiisg];
+        sum = simd_sum(sum);
+    }
+
+    const float inv_sum = 1.0f/sum;
+
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        pdst4[i00] *= inv_sum;
+    }
+}
+
+typedef decltype(kernel_soft_max<float>)    kernel_soft_max_t;
+typedef decltype(kernel_soft_max_4<float4>) kernel_soft_max_4_t;
+
+template [[host_name("kernel_soft_max_f16")]]   kernel kernel_soft_max_t   kernel_soft_max<half>;
+template [[host_name("kernel_soft_max_f32")]]   kernel kernel_soft_max_t   kernel_soft_max<float>;
+template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<half4>;
+template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<float4>;
+
+kernel void kernel_diag_mask_inf(
+        device const float * src0,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant       int & n_past,
+        uint3 tpig[[thread_position_in_grid]]) {
+    const int64_t i02 = tpig[2];
+    const int64_t i01 = tpig[1];
+    const int64_t i00 = tpig[0];
+
+    if (i00 > n_past + i01) {
+        dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
+    } else {
+        dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
+    }
+}
+
+kernel void kernel_diag_mask_inf_8(
+        device const float4 * src0,
+        device       float4 * dst,
+        constant    int64_t & ne00,
+        constant    int64_t & ne01,
+        constant        int & n_past,
+        uint3 tpig[[thread_position_in_grid]]) {
+
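+    // each thread copies two float4 (8 consecutive values), then re-masks the
+    // tail elements that fall above the diagonal (i00 > n_past + i01)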
+    const int64_t i = 2*tpig[0];
+
+    dst[i+0] = src0[i+0];
+    dst[i+1] = src0[i+1];
+    int64_t i4 = 4*i;
+    const int64_t i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
+    const int64_t i01 = i4/(ne00);      i4 -= i01*ne00;
+    const int64_t i00 = i4;
+    for (int k = 3; k >= 0; --k) {
+        if (i00 + 4 + k <= n_past + i01) {
+            break;
+        }
+        dst[i+1][k] = -INFINITY;
+        if (i00 + k > n_past + i01) {
+            dst[i][k] = -INFINITY;
+        }
+    }
+}
+
+kernel void kernel_norm(
+        device const  void * src0,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant     float & eps,
+        threadgroup float  * sum [[threadgroup(0)]],
+        uint tgpig[[threadgroup_position_in_grid]],
+        uint tpitg[[thread_position_in_threadgroup]],
+        uint   ntg[[threads_per_threadgroup]]) {
+    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
+    // MEAN
+    // parallel sum
+    sum[tpitg] = 0.0f;
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        sum[tpitg] += x[i00];
+    }
+    // reduce
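+    // binary-tree reduction over threadgroup memory: halve the number of active threads each step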
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    for (uint i = ntg/2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            sum[tpitg] += sum[tpitg + i];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+    const float mean  = sum[0] / ne00;
+
+    // recenter and VARIANCE
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    device float * y = dst + tgpig*ne00;
+    sum[tpitg] = 0.0f;
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
+        sum[tpitg] += y[i00] * y[i00];
+    }
+
+    // reduce
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    for (uint i = ntg/2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            sum[tpitg] += sum[tpitg + i];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+    const float variance = sum[0] / ne00;
+
+    const float scale = 1.0f/sqrt(variance + eps);
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = y[i00] * scale;
+    }
+}
+
+kernel void kernel_rms_norm(
+        device const  void * src0,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant     float & eps,
+        threadgroup float  * buf [[threadgroup(0)]],
+        uint tgpig[[threadgroup_position_in_grid]],
+        uint tpitg[[thread_position_in_threadgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint   ntg[[threads_per_threadgroup]]) {
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+
+    float4 sumf = 0;
+    float all_sum = 0;
+
+    // parallel sum
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        sumf += x[i00] * x[i00];
+    }
+    all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
+    all_sum = simd_sum(all_sum);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = all_sum;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        all_sum = buf[tiisg];
+        all_sum = simd_sum(all_sum);
+    }
+
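+    // RMSNorm: y = x / sqrt(mean(x^2) + eps) -- no mean subtraction, unlike kernel_norm above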
+    const float mean  = all_sum/ne00;
+    const float scale = 1.0f/sqrt(mean + eps);
+
+    device float4 * y = (device float4 *) (dst + tgpig*ne00);
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+        y[i00] = x[i00] * scale;
+    }
+}
+
+kernel void kernel_group_norm(
+        device const float * src0,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int32_t & n_groups,
+        constant     float & eps,
+        threadgroup float  * buf [[threadgroup(0)]],
+        uint tgpig[[threadgroup_position_in_grid]],
+        uint tpitg[[thread_position_in_threadgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint   ntg[[threads_per_threadgroup]]) {
+    const int64_t ne = ne00*ne01*ne02;
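+    // gs: elements per group -- the ne02 channels are split into n_groups, rounded up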
+    const int64_t gs = ne00*ne01*((ne02 + n_groups - 1) / n_groups);
+
+    int start = tgpig * gs;
+    int end   = start + gs;
+
+    start += tpitg;
+
+    if (end >= ne) {
+        end = ne;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += ntg) {
+        tmp += src0[j];
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    tmp = simd_sum(tmp);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = tmp;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        tmp = buf[tiisg];
+        tmp = simd_sum(tmp);
+    }
+
+    const float mean = tmp / gs;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += ntg) {
+        float xi = src0[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = simd_sum(tmp);
+    if (ntg > N_SIMDWIDTH) {
+        if (sgitg == 0) {
+            buf[tiisg] = 0.0f;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (tiisg == 0) {
+            buf[sgitg] = tmp;
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        tmp = buf[tiisg];
+        tmp = simd_sum(tmp);
+    }
+
+    const float variance = tmp / gs;
+    const float scale = 1.0f/sqrt(variance + eps);
+    for (int j = start; j < end; j += ntg) {
+        dst[j] *= scale;
+    }
+}
+
+// function to calculate the inner product between half a q4_0 block and 16 floats (yl); sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
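+// e.g. (qs[i/2] & 0x0F00) leaves the second nibble shifted up by 8 bits, and yl[i+1]
+// was pre-divided by 256, so their product equals yl * nibble with no shift instruction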
+inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+
+    float2 acc = 0.f;
+
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
+
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
+    }
+    return d * (sumy * -8.f + acc[0] + acc[1]);
+}
+
+// function to calculate the inner product between half a q4_1 block and 16 floats (yl); sumy is SUM(yl[i])
+// il indicates where the q4 quants begin (0 or QK4_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+    float m = qb_curr->m;
+
+    float2 acc = 0.f;
+
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
+
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
+                + yl[i + 1] * (qs[i / 2] & 0x0F00);
+        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
+                + yl[i + 9] * (qs[i / 2] & 0xF000);
+    }
+    return d * (acc[0] + acc[1]) + sumy * m;
+}
+
+// function to calculate the inner product between half a q5_0 block and 16 floats (yl); sumy is SUM(yl[i])
+// il indicates where the q5 quants begin (0 or QK5_0/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q5_0 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+
+    float2 acc = 0.f;
+
+    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 3 + il/2);
+           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
+
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
+                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
+        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
+                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
+    }
+    return d * (sumy * -16.f + acc[0] + acc[1]);
+}
+
+// function to calculate the inner product between half a q5_1 block and 16 floats (yl); sumy is SUM(yl[i])
+// il indicates where the q5 quants begin (0 or QK5_1/4)
+// we assume that the yl's have been multiplied with the appropriate scale factor
+// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
+inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thread float * yl, int il) {
+    float d = qb_curr->d;
+    float m = qb_curr->m;
+
+    float2 acc = 0.f;
+
+    device const uint16_t * qs =  ((device const uint16_t *)qb_curr + 4 + il/2);
+           const uint32_t   qh = *((device const uint32_t *)qb_curr->qh);
+
+    for (int i = 0; i < 8; i+=2) {
+        acc[0] += yl[i + 0] * ((qs[i / 2] & 0x000F) | ((qh >> (i+0+il        ) << 4 ) & 0x00010))
+                + yl[i + 1] * ((qs[i / 2] & 0x0F00) | ((qh >> (i+1+il        ) << 12) & 0x01000));
+        acc[1] += yl[i + 8] * ((qs[i / 2] & 0x00F0) | ((qh >> (i+0+il+QK5_0/2) << 8 ) & 0x00100))
+                + yl[i + 9] * ((qs[i / 2] & 0xF000) | ((qh >> (i+1+il+QK5_0/2) << 16) & 0x10000));
+    }
+    return d * (acc[0] + acc[1]) + sumy * m;
+}
+
+// putting them in the kernel causes a significant performance penalty
+#define N_DST 4        // each SIMD group works on 4 rows
+#define N_SIMDGROUP 2  // number of SIMD groups in a thread group
+// Note: This is a template, but strictly speaking it only applies to
+//       quantizations where the block size is 32. It also does not
+//       guard against the number of rows not being divisible by
+//       N_DST, so this is another explicit assumption of the implementation.
+template<typename block_q_type, int nr, int nsg, int nw>
+void mul_vec_q_n_f32_impl(
+        device const void  * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3 tgpig, uint tiisg, uint sgitg) {
+    const int nb = ne00/QK4_0;
+
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * nsg + sgitg) * nr;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
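+    // i12/r2 and i13/r3 map the src1 batch indices back into src0; r2 and r3 are
+    // presumably the dim-2/dim-3 broadcast ratios (e.g. grouped-query attention heads)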
+
+    device const block_q_type * x = (device const block_q_type *) src0 + offset0;
+    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[16]; // src1 vector cache
+    float sumf[nr] = {0.f};
+
+    const int ix = (tiisg/2);
+    const int il = (tiisg%2)*8;
+
+    device const float * yb = y + ix * QK4_0 + il;
+
+    // each thread in a SIMD group deals with half a block.
+    for (int ib = ix; ib < nb; ib += nw/2) {
+        float sumy = 0;
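+        // cache 16 y values, pre-scaled by 1, 1/256, 1/16 and 1/4096 so that
+        // block_q_n_dot_y can use plain masks instead of bit shifts on the nibbles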
+        for (int i = 0; i < 8; i += 2) {
+            sumy += yb[i] + yb[i+1];
+            yl[i+0] = yb[i+ 0];
+            yl[i+1] = yb[i+ 1]/256.f;
+
+            sumy += yb[i+16] + yb[i+17];
+            yl[i+8] = yb[i+16]/16.f;
+            yl[i+9] = yb[i+17]/4096.f;
+        }
+
+        for (int row = 0; row < nr; row++) {
+            sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
+        }
+
+        yb += QK4_0 * 16;
+    }
+
+    for (int row = 0; row < nr; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0 && first_row + row < ne01) {
+            dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
+        }
+    }
+}
+
+kernel void kernel_mul_mv_q4_0_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+}
+
+kernel void kernel_mul_mv_q4_1_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+}
+
+kernel void kernel_mul_mv_q5_0_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+}
+
+kernel void kernel_mul_mv_q5_1_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+    mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+}
+
+
+#define NB_Q8_0 8
+
+void kernel_mul_mv_q8_0_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+    const int nr  = N_DST;
+    const int nsg = N_SIMDGROUP;
+    const int nw  = N_SIMDWIDTH;
+
+    const int nb = ne00/QK8_0;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * nsg + sgitg) * nr;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_q8_0 * x = (device const block_q8_0 *) src0 + offset0;
+    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[NB_Q8_0];
+    float sumf[nr]={0.f};
+
+    const int ix = tiisg/4;
+    const int il = tiisg%4;
+
+    device const float * yb = y + ix * QK8_0 + NB_Q8_0*il;
+
+    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
+    for (int ib = ix; ib < nb; ib += nw/4) {
+        for (int i = 0; i < NB_Q8_0; ++i) {
+            yl[i] = yb[i];
+        }
+
+        for (int row = 0; row < nr; row++) {
+            device const int8_t * qs = x[ib+row*nb].qs + NB_Q8_0*il;
+            float sumq = 0.f;
+            for (int iq = 0; iq < NB_Q8_0; ++iq) {
+                sumq += qs[iq] * yl[iq];
+            }
+            sumf[row] += sumq*x[ib+row*nb].d;
+        }
+
+        yb += NB_Q8_0 * nw;
+    }
+
+    for (int row = 0; row < nr; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0 && first_row + row < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_q8_0_f32")]]
+kernel void kernel_mul_mv_q8_0_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+    kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg);
+}
+
+#define N_F32_F32 4
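+// rows of src1 handled per threadgroup along dim 1 (same convention as N_F16_F32/N_F16_F16 below)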
+
+void kernel_mul_mv_f32_f32_impl(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                  uint64_t   nb00,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                   int64_t   ne10,
+                   int64_t   ne11,
+                   int64_t   ne12,
+                  uint64_t   nb10,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                     uint    r2,
+                     uint    r3,
+                     uint3   tgpig,
+                     uint    tiisg) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F32_F32;
+    const int64_t im = tgpig.z;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+
+    device const float * x = (device const float *) (src0 + offset0);
+
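+    // two paths: a plain scalar loop for short rows, and a float4 path for longer
+    // rows where lane 0 picks up the scalar tail after the SIMD reduction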
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const float4 * x4 = (device const float4 *)x;
+        for (int row = 0; row < N_F32_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float  * y  = (device const float  *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_f32_f32")]]
+kernel void kernel_mul_mv_f32_f32(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]]) {
+    kernel_mul_mv_f32_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
+}
+
+#define N_F16_F16 4
+
+kernel void kernel_mul_mv_f16_f16(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F16;
+    const int64_t im = tgpig.z;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+
+    device const half * x = (device const half *) (src0 + offset0);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F16; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (half) x[i] * (half) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F16; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const half  * y  = (device const half  *) (src1 + r1*nb11 + im*nb12);
+            device const half4 * y4 = (device const half4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+void kernel_mul_mv_f16_f32_1row_impl(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]]) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+    const int64_t im = tgpig.z;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+
+    device const half  * x = (device const half  *) (src0 + offset0);
+    device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+    float sumf = 0;
+    if (ne00 < 128) {
+        for (int i = tiisg; i < ne00; i += 32) {
+            sumf += (float) x[i] * (float) y[i];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    } else {
+        device const half4  * x4 = (device const half4  *) x;
+        device const float4 * y4 = (device const float4 *) y;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float)x4[i][k] * y4[i][k];
+        }
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_f16_f32_1row")]]
+kernel void kernel_mul_mv_f16_f32_1row(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]]) {
+    kernel_mul_mv_f16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
+}
+
+#define N_F16_F32 4
+
+void kernel_mul_mv_f16_f32_impl(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                  uint64_t   nb00,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                   int64_t   ne10,
+                   int64_t   ne11,
+                   int64_t   ne12,
+                  uint64_t   nb10,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+                   uint3     tgpig,
+                   uint      tiisg) {
+
+    const int64_t r0 = tgpig.x;
+    const int64_t rb = tgpig.y*N_F16_F32;
+    const int64_t im = tgpig.z;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+
+    device const half * x = (device const half *) (src0 + offset0);
+
+    if (ne00 < 128) {
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00; i += 32) {
+                sumf += (float) x[i] * (float) y[i];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    } else {
+        device const half4 * x4 = (device const half4 *)x;
+        for (int row = 0; row < N_F16_F32; ++row) {
+            int r1 = rb + row;
+            if (r1 >= ne11) {
+                break;
+            }
+
+            device const float  * y  = (device const float  *) (src1 + r1*nb11 + im*nb12);
+            device const float4 * y4 = (device const float4 *) y;
+
+            float sumf = 0;
+            for (int i = tiisg; i < ne00/4; i += 32) {
+                for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+            }
+
+            float all_sum = simd_sum(sumf);
+            if (tiisg == 0) {
+                for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (float) x[i] * y[i];
+                dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+            }
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_f16_f32")]]
+kernel void kernel_mul_mv_f16_f32(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
+    kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg);
+}
+
+// Assumes row size (ne00) is a multiple of 4
+kernel void kernel_mul_mv_f16_f32_l4(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]]) {
+
+    const int nrows = ne11;
+    const int64_t r0 = tgpig.x;
+    const int64_t im = tgpig.z;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb02*ne02;
+
+    device const half4 * x4 = (device const half4 *) (src0 + offset0);
+
+    for (int r1 = 0; r1 < nrows; ++r1) {
+        device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
+
+        float sumf = 0;
+        for (int i = tiisg; i < ne00/4; i += 32) {
+            for (int k = 0; k < 4; ++k) sumf += (float) x4[i][k] * y4[i][k];
+        }
+
+        float all_sum = simd_sum(sumf);
+        if (tiisg == 0) {
+            dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
+        }
+    }
+}
+
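+// linear ramp used by YaRN: returns 1 for i0/2 <= low and falls linearly to 0 at i0/2 == high
+// (i0 counts individual elements, hence the /2 to get the dimension-pair index)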
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
+    thread float * cos_theta, thread float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
+    }
+    *cos_theta = cos(theta) * mscale;
+    *sin_theta = sin(theta) * mscale;
+}
+
+// Solving `max_pos_emb = n_rot * 2pi * base^((2 * x) / n_dims)` for x, we get
+// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
+    return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base));
+}
+
+static void rope_yarn_corr_dims(
+    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
+) {
+    // start and end correction dims
+    dims[0] = max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
+    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
+}
+
+typedef void (rope_t)(
+        device const    void * src0,
+        device const int32_t * src1,
+        device         float * dst,
+        constant     int64_t & ne00,
+        constant     int64_t & ne01,
+        constant     int64_t & ne02,
+        constant     int64_t & ne03,
+        constant    uint64_t & nb00,
+        constant    uint64_t & nb01,
+        constant    uint64_t & nb02,
+        constant    uint64_t & nb03,
+        constant     int64_t & ne0,
+        constant     int64_t & ne1,
+        constant     int64_t & ne2,
+        constant     int64_t & ne3,
+        constant    uint64_t & nb0,
+        constant    uint64_t & nb1,
+        constant    uint64_t & nb2,
+        constant    uint64_t & nb3,
+        constant         int & n_past,
+        constant         int & n_dims,
+        constant         int & mode,
+        constant         int & n_orig_ctx,
+        constant       float & freq_base,
+        constant       float & freq_scale,
+        constant       float & ext_factor,
+        constant       float & attn_factor,
+        constant       float & beta_fast,
+        constant       float & beta_slow,
+        uint  tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]);
+
+template<typename T>
+kernel void kernel_rope(
+        device const    void * src0,
+        device const int32_t * src1,
+        device         float * dst,
+        constant     int64_t & ne00,
+        constant     int64_t & ne01,
+        constant     int64_t & ne02,
+        constant     int64_t & ne03,
+        constant    uint64_t & nb00,
+        constant    uint64_t & nb01,
+        constant    uint64_t & nb02,
+        constant    uint64_t & nb03,
+        constant     int64_t & ne0,
+        constant     int64_t & ne1,
+        constant     int64_t & ne2,
+        constant     int64_t & ne3,
+        constant    uint64_t & nb0,
+        constant    uint64_t & nb1,
+        constant    uint64_t & nb2,
+        constant    uint64_t & nb3,
+        constant         int & n_past,
+        constant         int & n_dims,
+        constant         int & mode,
+        constant         int & n_orig_ctx,
+        constant       float & freq_base,
+        constant       float & freq_scale,
+        constant       float & ext_factor,
+        constant       float & attn_factor,
+        constant       float & beta_fast,
+        constant       float & beta_slow,
+        uint  tiitg[[thread_index_in_threadgroup]],
+        uint3 tptg[[threads_per_threadgroup]],
+        uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];
+
+    const bool is_neox = mode & 2;
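+    // bit 1 of mode selects NeoX-style RoPE: rotate (i, i + n_dims/2) pairs
+    // instead of the interleaved (i, i+1) pairs of the original formulation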
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+
+    device const int32_t * pos = src1;
+
+    const int64_t p = pos[i2];
+
+    const float theta_0 = (float)p;
+    const float inv_ndims = -1.f/n_dims;
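+    // theta for element i0 is p * freq_base^(-i0/n_dims); the minus sign is folded into inv_ndims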
+
+    if (!is_neox) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
+            float cos_theta, sin_theta;
+            rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+            const T x0 = src[0];
+            const T x1 = src[1];
+
+            dst_data[0] = x0*cos_theta - x1*sin_theta;
+            dst_data[1] = x0*sin_theta + x1*cos_theta;
+        }
+    } else {
+        for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
+            if (ic < n_dims) {
+                const int64_t ib = 0;
+
+                // simplified from `(ib * n_dims + ic) * inv_ndims`
+                const float cur_rot = inv_ndims*ic - ib;
+
+                const float theta = theta_0 * pow(freq_base, cur_rot);
+                float cos_theta, sin_theta;
+                rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+                const int64_t i0 = ib*n_dims + ic/2;
+
+                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                const float x0 = src[0];
+                const float x1 = src[n_dims/2];
+
+                dst_data[0]        = x0*cos_theta - x1*sin_theta;
+                dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+            } else {
+                const int64_t i0 = ic;
+
+                device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                dst_data[0] = src[0];
+                dst_data[1] = src[1];
+            }
+        }
+    }
+}
+
+template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
+template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
+
+typedef void (im2col_t)(
+        device const float * x,
+        device        char * dst,
+        constant   int32_t & ofs0,
+        constant   int32_t & ofs1,
+        constant   int32_t & IW,
+        constant   int32_t & IH,
+        constant   int32_t & CHW,
+        constant   int32_t & s0,
+        constant   int32_t & s1,
+        constant   int32_t & p0,
+        constant   int32_t & p1,
+        constant   int32_t & d0,
+        constant   int32_t & d1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]);
+
+template <typename T>
+kernel void kernel_im2col(
+        device const float * x,
+        device        char * dst,
+        constant   int32_t & ofs0,
+        constant   int32_t & ofs1,
+        constant   int32_t & IW,
+        constant   int32_t & IH,
+        constant   int32_t & CHW,
+        constant   int32_t & s0,
+        constant   int32_t & s1,
+        constant   int32_t & p0,
+        constant   int32_t & p1,
+        constant   int32_t & d0,
+        constant   int32_t & d1,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3  tgpg[[threadgroups_per_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0;
+    const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1;
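+    // assuming the host dispatches (IC, OH, OW) threadgroups of (N, KH, KW) threads,
+    // this writes dst[n*OH*OW + oh*OW + ow][ic*KH*KW + kh*KW + kw] from the input
+    // element at batch n, channel ic, row iih, column iiw (or 0 when out of bounds)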
+
+    const int32_t offset_dst =
+        (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
+        (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]);
+
+    device T * pdst = (device T *) (dst);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        pdst[offset_dst] = 0.0f;
+    } else {
+        const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1;
+        pdst[offset_dst] = x[offset_src + iih * IW + iiw];
+    }
+}
+
+template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
+template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
+
+kernel void kernel_upscale_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    constant     float & sf0,
+    constant     float & sf1,
+    constant     float & sf2,
+    constant     float & sf3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
+
+    const int64_t i03 = i3/sf3;
+    const int64_t i02 = i2/sf2;
+    const int64_t i01 = i1/sf1;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        const int64_t i00 = i0/sf0;
+
+        device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1  +  i0*nb0);
+
+        dst_ptr[0] = src0_ptr[0];
+    }
+}
+
+kernel void kernel_pad_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i3 = tgpig.z;
+    const int64_t i2 = tgpig.y;
+    const int64_t i1 = tgpig.x;
+
+    const int64_t i03 = i3;
+    const int64_t i02 = i2;
+    const int64_t i01 = i1;
+
+    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
+    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
+
+    if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
+        for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+            if (i0 < ne00) {
+                dst_ptr[i0] = src0_ptr[i0];
+            } else {
+                dst_ptr[i0] = 0.0f;
+            }
+        }
+
+        return;
+    }
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        dst_ptr[i0] = 0.0f;
+    }
+}
+
+kernel void kernel_arange_f32(
+    device        char * dst,
+    constant   int64_t & ne0,
+    constant   float   & start,
+    constant   float   & step,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    device float * dst_ptr = (device float *) dst;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        dst_ptr[i0] = start + step * i0;
+    }
+}
+
+kernel void kernel_timestep_embedding_f32(
+    device  const char * src0,
+    device        char * dst,
+    constant  uint64_t & nb1,
+    constant  int      & dim,
+    constant  int      & max_period,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    int i = tgpig.x;
+    device float * embed_data = (device float *)(dst +  i*nb1);
+
+    int half_ = dim / 2;
+    for (int j = tpitg.x; j < half_; j += ntg.x) {
+        float timestep = ((device float *)src0)[i];
+        float freq = (float)exp(-log((float)max_period) * j / half_);
+        float arg = timestep * freq;
+        embed_data[j        ] = cos(arg);
+        embed_data[j + half_] = sin(arg);
+    }
+
+    if (dim % 2 != 0 && tpitg.x == 0) {
+        embed_data[dim] = 0.f;
+    }
+}
+
+// bitonic sort implementation, following the CUDA kernels as a reference
+typedef void (argsort_t)(
+        device const float  * x,
+        device     int32_t  * dst,
+        constant   int64_t  & ncols,
+        constant   int64_t  & ncols_pad,
+        threadgroup int32_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]]);
+
+template<ggml_sort_order order>
+kernel void kernel_argsort_f32_i32(
+        device const float   * x,
+        device       int32_t * dst,
+        constant     int64_t & ncols,
+        constant     int64_t & ncols_pad,
+        threadgroup int32_t  * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]]) {
+    // bitonic sort
+    int col = tpitg[0];
+    int row = tgpig[1];
+
+    if (col >= ncols_pad) return;
+
+    device const float   * x_row   = x + row * ncols;
+    threadgroup int32_t  * dst_row = shared_values;
+
+    // initialize indices
+    dst_row[col] = col;
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
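+    // standard bitonic network over the padded width: k is the size of the sequence
+    // being merged and j the compare distance; padded entries (index >= ncols) always
+    // sort past the real ones, ending up in the tail that the final copy discards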
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+template [[host_name("kernel_argsort_f32_i32_asc")]]  kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_ASC>;
+template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_DESC>;
+
+kernel void kernel_leaky_relu_f32(
+        device const float * src0,
+        device       float * dst,
+        constant     float & slope,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope;
+}
+
+typedef void (flash_attn_ext_f16_t)(
+        device const  char * q,
+        device const  char * k,
+        device const  char * v,
+        device const  char * mask,
+        device       float * dst,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant   int64_t & ne13,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
+        constant  uint64_t & nb21,
+        constant  uint64_t & nb22,
+        constant  uint64_t & nb23,
+        constant  uint64_t & nb31,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant     float & scale,
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup   half * shared,
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        uint3  tpitg[[thread_position_in_threadgroup]],
+        uint3    ntg[[threads_per_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]);
+
+// ref: https://arxiv.org/pdf/2307.08691.pdf
+template<int64_t D, int64_t Q = 8, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
+kernel void kernel_flash_attn_ext_f16(
+        device const  char * q,
+        device const  char * k,
+        device const  char * v,
+        device const  char * mask,
+        device       float * dst,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant   int64_t & ne13,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
+        constant  uint64_t & nb21,
+        constant  uint64_t & nb22,
+        constant  uint64_t & nb23,
+        constant  uint64_t & nb31,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant     float & scale,
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup   half * shared [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        uint3  tpitg[[thread_position_in_threadgroup]],
+        uint3    ntg[[threads_per_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    const short nsg = ntg.y; // number of simdgroups
+
+    const short iq3 = tgpig[2];
+    const short iq2 = tgpig[1];
+    const short iq1 = tgpig[0]*Q;
+
+    const short D4 = D/4;
+    const short D8 = D/8;
+  //const short Q8 = Q/8;
+    const short NW = N_SIMDWIDTH;
+    const short SH = (C + Q); // shared memory per simdgroup in (half)
+
+    const short T  = D + 2*nsg*SH; // shared memory size per query in (half)
+    const short TF = T/2;        // shared memory size per query in (float)
+    const short T4 = T/4;        // shared memory size per query in (half4)
+
+    threadgroup half  * sq  = (threadgroup half  *) (shared +              0*D); // holds the query data
+    threadgroup half4 * sq4 = (threadgroup half4 *) (shared +              0*D); // same as above but in half4
+    threadgroup float * ss  = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
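+    // per-query shared memory layout: D halves of query data, then for each simdgroup
+    // SH floats (i.e. 2*SH halves) of attention scratch; hence T = D + 2*nsg*SH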
+
+    // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
+    simdgroup_half8x8 lo[D8];
+
+    // load heads from Q to shared memory
+    for (short j = sgitg; j < Q; j += nsg) {
+        device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03));
+
+        for (short i = tiisg; i < D4; i += NW) {
+            if (iq1 + j < ne01) {
+                sq4[j*T4 + i] = (half4) q4[i];
+            } else {
+                sq4[j*T4 + i] = 0.0h;
+            }
+        }
+    }
+
+    // zero out lo
+    for (short i = 0; i < D8; ++i) {
+        lo[i] = make_filled_simdgroup_matrix<half, 8>(0.0h);
+    }
+
+    // zero out shared memory SH
+    for (short j = 0; j < Q; ++j) {
+        for (short i = tiisg; i < SH; i += NW) {
+            ss[j*TF + i] = 0.0f;
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    {
+        float S[Q] = { [0 ... Q-1] = 0.0h };
+        float M[Q] = { [0 ... Q-1] = -FLT_MAX/2 };
+
+        // assume K and V are same shape
+        const short ne22 = ne12;
+        const short ne23 = ne13;
+
+        // broadcast
+        const short rk2 = ne02/ne12;
+        const short rk3 = ne03/ne13;
+
+        const short rv2 = ne02/ne22;
+        const short rv3 = ne03/ne23;
+
+        // k indices
+        const short ik2 = iq2/rk2;
+        const short ik3 = iq3/rk3;
+
+        // v indices
+        const short iv2 = iq2/rv2;
+        const short iv3 = iq3/rv3;
+
+        // load the queries from shared memory into local memory
+        simdgroup_half8x8 mq[D8];
+
+        for (short i = 0; i < D8; ++i) {
+            simdgroup_load(mq[i], sq + i*8, T);
+        }
+
+        // pointer to the mask
+        device const half * mp = (device const half *) (mask + iq1*nb31);
+
+        // prepare diagonal scale matrix
+        simdgroup_float8x8 mscale(scale);
+
+        // prepare diagonal slope matrix
+        simdgroup_float8x8 mslope(1.0f);
+
+        // ALiBi
+        if (max_bias > 0.0f) {
+            const uint32_t h = iq2;
+
+            const float base = h < n_head_log2 ? m0 : m1;
+            const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+            mslope = simdgroup_float8x8(pow(base, exph));
+        }
+
+        // loop over the KV cache
+        // each simdgroup handles blocks of Q rows and C columns
+        for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) {
+            const int ic = ic0 + C*sgitg;
+            if (ic >= ne11) {
+                break;
+            }
+
+            // Q*K^T
+            {
+                for (short cc = 0; cc < C/8; ++cc) {
+                    simdgroup_float8x8 mqk = make_filled_simdgroup_matrix<float, 8>(0.h);
+
+                    device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13));
+
+                    for (short i = 0; i < D8; ++i) {
+                        simdgroup_half8x8 mk;
+                        simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose
+
+                        simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
+                    }
+
+                    if (mask != q) {
+                        // mqk = mqk*scale + mask*slope
+                        simdgroup_half8x8 mm;
+                        simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
+                        simdgroup_multiply(mm, mslope, mm);
+                        simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
+                    } else {
+                        // mqk = mqk*scale
+                        simdgroup_multiply(mqk, mscale, mqk);
+                    }
+
+                    simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
+                }
+            }
+
+            // used to detect blocks full of -INF
+            float smax = -INFINITY;
+
+            // online softmax
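+            // keep a running max M and normalizer S per query row; whenever M grows,
+            // previous contributions are rescaled by ms = exp(m_old - M_new), applied
+            // to S here and to the accumulated O via the diagonal matrix below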
+            {
+                float ms[Q];
+
+                for (short j = 0; j < Q; ++j) {
+                    const short p = tiisg;
+
+                    const float m = M[j];
+                    const float s = ss[j*TF + p];
+
+                    smax = simd_max(max(smax, s));
+                    M[j] = simd_max(max(M[j], s));
+
+                                ms[j] = exp(m - M[j]);
+                    const float vs    = exp(s - M[j]);
+
+                    S[j] = S[j]*ms[j] + simd_sum(vs);
+
+                    // the P matrix from the paper (Q rows, C columns)
+                    ss[j*TF + p] = vs;
+                }
+
+                // create a QxQ diagonal matrix for rescaling the output
+                if (tiisg < Q) {
+                    ss[tiisg*TF + C + tiisg] = ms[tiisg];
+                }
+            }
+
+            // skip -INF blocks
+            if (smax == -INFINITY) {
+                continue;
+            }
+
+            // O = diag(ms)*O
+            {
+                simdgroup_float8x8 mm;
+                simdgroup_load(mm, ss + C, TF, 0, false);
+
+                for (short i = 0; i < D8; ++i) {
+                    simdgroup_multiply(lo[i], mm, lo[i]);
+                }
+            }
+
+            // O = O + (Q*K^T)*V
+            {
+                for (short cc = 0; cc < C/8; ++cc) {
+                    device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23));
+
+                    for (short i = 0; i < D8; ++i) {
+                        simdgroup_half8x8 mk;
+                        simdgroup_load(mk, pv + i*8, nb21/sizeof(half), 0, false);
+
+                        simdgroup_float8x8 mv;
+                        simdgroup_load(mv, ss + 8*cc, TF, 0, false);
+
+                        simdgroup_multiply_accumulate(lo[i], mv, mk, lo[i]);
+                    }
+                }
+            }
+        }
+
+        // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
+        for (short j = 0; j < Q; ++j) {
+            if (tiisg == 0) {
+                ss[j*TF + 0] = S[j];
+                ss[j*TF + 1] = M[j];
+            }
+        }
+    }
+
+    // reduce the warps sequentially
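+    // merge each remaining simdgroup's partial (S, M, O) into simdgroup 0 using the
+    // same log-sum-exp rescaling as the online softmax above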
+    for (short sg = 1; sg < nsg; ++sg) {
+        float S = { 0.0h };
+        float M = { -FLT_MAX/2 };
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // each simdgroup stores its output to shared memory, reusing sq
+        if (sgitg == sg) {
+            for (short i = 0; i < D8; ++i) {
+                simdgroup_store(lo[i], sq + i*8, T, 0, false);
+            }
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // the first simdgroup accumulates the results from the other simdgroups
+        if (sgitg == 0) {
+            for (short j = 0; j < Q; ++j) {
+                const float S0 = ss[j*TF +         0];
+                const float S1 = ss[j*TF + sg*SH + 0];
+
+                const float M0 = ss[j*TF +         1];
+                const float M1 = ss[j*TF + sg*SH + 1];
+
+                M = max(M0, M1);
+
+                const float ms0 = exp(M0 - M);
+                const float ms1 = exp(M1 - M);
+
+                S = S0*ms0 + S1*ms1;
+
+                if (tiisg == 0) {
+                    ss[j*TF + 0] = S;
+                    ss[j*TF + 1] = M;
+
+                    ss[j*TF + C + j        ] = ms0;
+                    ss[j*TF + C + j + sg*SH] = ms1;
+                }
+            }
+
+            // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
+            {
+                simdgroup_half8x8 t;
+                simdgroup_float8x8 ms0;
+                simdgroup_float8x8 ms1;
+
+                simdgroup_load(ms0, ss + C,         TF, 0, false);
+                simdgroup_load(ms1, ss + C + sg*SH, TF, 0, false);
+
+                for (short i = 0; i < D8; ++i) {
+                    simdgroup_load    (t, sq + i*8, T, 0, false);
+                    simdgroup_multiply(t, ms1, t);
+
+                    simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t);
+                }
+            }
+        }
+    }
+
+    // store result to shared memory (reuse sq)
+    if (sgitg == 0) {
+        for (short i = 0; i < D8; ++i) {
+            simdgroup_store(lo[i], sq + i*8, T, 0, false);
+        }
+    }
+
+    device float4 * dst4 = (device float4 *) dst;
+
+    // final rescale with 1/S and store to global memory
+    if (sgitg == 0) {
+        for (short j = 0; j < Q && iq1 + j < ne01; ++j) {
+            const float S = ss[j*TF + 0];
+
+            for (short i = tiisg; i < D4; i += NW) {
+                dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) sq4[j*T4 + i]/S;
+            }
+        }
+    }
+}
+
+template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64>;
+template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80>;
+template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>;
+template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>;
+template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>;
+template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
+
+template<int64_t D, int64_t Q = 1, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
+kernel void kernel_flash_attn_ext_vec_f16(
+        device const  char * q,
+        device const  char * k,
+        device const  char * v,
+        device const  char * mask,
+        device       float * dst,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant   int64_t & ne13,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
+        constant  uint64_t & nb21,
+        constant  uint64_t & nb22,
+        constant  uint64_t & nb23,
+        constant  uint64_t & nb31,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant     float & scale,
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup   half * shared [[threadgroup(0)]],
+        uint3  tgpig[[threadgroup_position_in_grid]],
+        uint3  tpitg[[thread_position_in_threadgroup]],
+        uint3    ntg[[threads_per_threadgroup]],
+        ushort tiisg[[thread_index_in_simdgroup]],
+        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
+    const short nsg = ntg.y; // number of simdgroups
+
+    const short iq3 = tgpig[2];
+    const short iq2 = tgpig[1];
+    const short iq1 = tgpig[0];
+
+    const short D4 = D/4;
+    const short NW = N_SIMDWIDTH;
+    const short SH = (C + Q); // shared memory per simdgroup in (half)
+
+    const short T  = D + 2*nsg*SH; // shared memory size per query in (half)
+
+    float slope = 1.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const uint32_t h = iq2;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
+  //threadgroup half   * sq  = (threadgroup half   *) (shared +              0*D); // holds the query data
+    threadgroup half4  * sq4 = (threadgroup half4  *) (shared +              0*D); // same as above but in half4
+    threadgroup float  * ss  = (threadgroup float  *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
+    threadgroup float4 * ss4 = (threadgroup float4 *) (shared + 2*sgitg*SH + 1*D); // same as above but in float4
+    threadgroup half4  * sr4 = (threadgroup half4  *) (shared +   sgitg*D  + 1*T); // scratch buffer for the results
+
+    // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
+    half4 lo[D4/NW];
+
+    // load heads from Q to shared memory
+    device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03));
+
+    for (short i = tiisg; i < D4; i += NW) {
+        if (iq1 < ne01) {
+            sq4[i] = (half4) q4[i];
+        } else {
+            sq4[i] = 0.0h;
+        }
+    }
+
+    // zero out lo
+    for (short i = tiisg; i < D4; i += NW) {
+        lo[i/NW] = 0.0h;
+    }
+
+    // zero out shared memory SH
+    for (short i = tiisg; i < SH/4; i += NW) {
+        ss4[i] = 0.0f;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    {
+        float S = { 0.0f };
+        float M = { -FLT_MAX/2 };
+
+        // assume K and V are the same shape
+        const short ne22 = ne12;
+        const short ne23 = ne13;
+
+        // broadcast
+        const short rk2 = ne02/ne12;
+        const short rk3 = ne03/ne13;
+
+        const short rv2 = ne02/ne22;
+        const short rv3 = ne03/ne23;
+
+        // k indices
+        const short ik2 = iq2 / rk2;
+        const short ik3 = iq3 / rk3;
+
+        // v indices
+        const short iv2 = iq2 / rv2;
+        const short iv3 = iq3 / rv3;
+
+        // load the queries from shared memory into local memory
+        half4 mq[D4];
+
+        for (short ii = 0; ii < D4; ii += NW) {
+            short i = ii + tiisg;
+            mq[i] = sq4[i];
+        }
+
+        // pointer to the mask
+        device const half4 * mp4 = (device const half4 *) (mask + iq1*nb31);
+
+        // loop over the KV cache
+        // each simdgroup handles blocks of Q rows and C columns
+        for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) {
+            const int ic = ic0 + C*sgitg;
+            if (ic >= ne11) {
+                break;
+            }
+
+            // Q*K^T
+            {
+#pragma unroll
+                for (short cc = 0; cc < C/4; ++cc) {
+                    float4 mqk = { 0.0f };
+
+                    device const half4 * pk4 = (device const half4 *) ((device const char *) k + ((ic + 4*cc)*nb11 + ik2*nb12 + ik3*nb13));
+
+#pragma unroll
+                    for (short ii = 0; ii < D4; ii += NW) {
+                        const short i = ii + tiisg;
+
+                        half4x4 mk;
+                        mk[0] = pk4[i + 0*(nb11/8)];
+                        mk[1] = pk4[i + 1*(nb11/8)];
+                        mk[2] = pk4[i + 2*(nb11/8)];
+                        mk[3] = pk4[i + 3*(nb11/8)];
+
+                        mqk += (float4) (mq[i] * mk);
+                    }
+
+                    // reduce the results from the threads in the simdgroup
+                    mqk += simd_shuffle_down(mqk, 16);
+                    mqk += simd_shuffle_down(mqk,  8);
+                    mqk += simd_shuffle_down(mqk,  4);
+                    mqk += simd_shuffle_down(mqk,  2);
+                    mqk += simd_shuffle_down(mqk,  1);
+
+                    // mqk = mqk*scale + mask*slope
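+                    // (when no mask is provided, the host is expected to bind q in its
+                    // place, so `mask != q` detects whether a real mask buffer is bound)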
+                    if (tiisg == 0) {
+                        mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);
+
+                        ss4[cc] = mqk;
+                    }
+                }
+            }
+
+            // online softmax
+            {
+                const short p = tiisg;
+
+                const float m = M;
+                const float s = ss[p];
+
+                M = simd_max(max(M, s));
+
+                const float ms = exp(m - M);
+                const float vs = exp(s - M);
+
+                S = S*ms + simd_sum(vs);
+
+                // the P matrix from the paper (Q rows, C columns)
+                ss[p] = vs;
+
+                // O = diag(ms)*O
+#pragma unroll
+                for (short ii = 0; ii < D4; ii += NW) {
+                    const short i = ii + tiisg;
+                    lo[i/NW] *= ms;
+                }
+            }
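+
+            // per simdgroup, M is now the running row maximum, S the running softmax
+            // denominator and lo the running numerator, i.e.
+            //
+            //   softmax(Q*K^T)*V = (sum_j exp(s_j - M)*v_j) / (sum_j exp(s_j - M))
+            //
+            // each new block of C scores is folded in by rescaling with ms = exp(m - M)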
+
+            // O = O + (Q*K^T)*V
+            {
+#pragma unroll
+                for (short cc = 0; cc < C/4; ++cc) {
+                    device const half4 * pv4 = (device const half4 *) ((device const char *) v + ((ic + 4*cc)*nb21 + iv2*nb22 + iv3*nb23));
+
+#pragma unroll
+                    for (short ii = 0; ii < D4; ii += NW) {
+                        const short i = ii + tiisg;
+
+                        lo[i/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0];
+                        lo[i/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1];
+                        lo[i/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2];
+                        lo[i/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3];
+                    }
+                }
+            }
+
+        }
+
+        // these are needed for reducing the results from the simdgroups (reuse the ss buffer)
+        if (tiisg == 0) {
+            ss[0] = S;
+            ss[1] = M;
+        }
+    }
+
+    // store results to shared memory
+    for (short ii = 0; ii < D4; ii += NW) {
+        short i = ii + tiisg;
+        sr4[i] = lo[ii/NW];
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // parallel reduce
+    for (short r = nsg/2; r > 0; r >>= 1) {
+        if (sgitg < r) {
+            const float S0 = ss[       0];
+            const float S1 = ss[r*SH + 0];
+
+            const float M0 = ss[       1];
+            const float M1 = ss[r*SH + 1];
+
+            const float M = max(M0, M1);
+
+            const float ms0 = exp(M0 - M);
+            const float ms1 = exp(M1 - M);
+
+            const float S = S0*ms0 + S1*ms1;
+
+            if (tiisg == 0) {
+                ss[0] = S;
+                ss[1] = M;
+            }
+
+            // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
+            for (short ii = 0; ii < D4; ii += NW) {
+                short i = ii + tiisg;
+                sr4[i] = sr4[i]*ms0 + sr4[i + r*D4]*ms1;
+            }
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
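+
+    // after log2(nsg) halving rounds, simdgroup 0 holds the fully merged (S, M)
+    // in ss and the merged output rows in sr4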
+
+    device float4 * dst4 = (device float4 *) dst;
+
+    // final rescale with 1/S and store to global memory
+    if (sgitg == 0) {
+        const float S = ss[0];
+
+        for (short ii = 0; ii < D4; ii += NW) {
+            short i = ii + tiisg;
+            dst4[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D4 + i] = (float4) sr4[i]/S;
+        }
+    }
+}
+
+template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>;
+template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
+
+kernel void kernel_cpy_f16_f16(
+        device  const half * src0,
+        device        half * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
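+    // unravel the flat element index n with the dst shape (ne0..ne3); this lets
+    // the copy permute/reshape between the src and dst layouts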
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
+        device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        dst_data[i00] = src[0];
+    }
+}
+
+kernel void kernel_cpy_f16_f32(
+        device  const half * src0,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
+        device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        dst_data[i00] = src[0];
+    }
+}
+
+kernel void kernel_cpy_f32_f16(
+        device const float * src0,
+        device        half * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
+        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        // TODO: is there a better way to handle -INFINITY?
+        dst_data[i00] = src[0] == -INFINITY ? -MAXHALF : src[0];
+    }
+}
+
+kernel void kernel_cpy_f32_f32(
+        device const float * src0,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
+        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        dst_data[i00] = src[0];
+    }
+}
+
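+// quantize to Q8_0 while copying: each block of QK8_0 = 32 floats is stored as a
+// half scale d = amax/127 plus 32 signed 8-bit codes, so that x[j] ~= d * qs[j]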
+kernel void kernel_cpy_f32_q8_0(
+        device const float * src0,
+        device        void * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK8_0;
+
+    device block_q8_0 * dst_data = (device block_q8_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x*QK8_0; i00 < ne00; i00 += ntg.x*QK8_0) {
+        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        float amax = 0.0f; // absolute max
+
+        for (int j = 0; j < QK8_0; j++) {
+            const float v = src[j];
+            amax = MAX(amax, fabs(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK8_0].d = d;
+
+        for (int j = 0; j < QK8_0; ++j) {
+            const float x0 = src[j]*id;
+
+            dst_data[i00/QK8_0].qs[j] = round(x0);
+        }
+    }
+}
+
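+// quantize to Q4_0 while copying: blocks of QK4_0 = 32 floats become a half scale
+// d = max/-8 and 32 4-bit codes packed two per byte, with x[j] ~= d * (q[j] - 8)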
+kernel void kernel_cpy_f32_q4_0(
+        device const float * src0,
+        device        void * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0;
+
+    device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) {
+        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < QK4_0; j++) {
+            const float v = src[j];
+            if (amax < fabs(v)) {
+                amax = fabs(v);
+                max  = v;
+            }
+        }
+
+        const float d = max / -8;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK4_0].d = d;
+
+        for (int j = 0; j < QK4_0/2; ++j) {
+            const float x0 = src[0       + j]*id;
+            const float x1 = src[QK4_0/2 + j]*id;
+
+            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
+            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
+
+            dst_data[i00/QK4_0].qs[j]  = xi0;
+            dst_data[i00/QK4_0].qs[j] |= xi1 << 4;
+        }
+    }
+}
+
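+// quantize to Q4_1 while copying: like Q4_0 but with an explicit minimum m, so
+// d = (max - min)/15 and x[j] ~= d * q[j] + m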
+kernel void kernel_cpy_f32_q4_1(
+        device const float * src0,
+        device        void * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_1;
+
+    device block_q4_1 * dst_data = (device block_q4_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x*QK4_1; i00 < ne00; i00 += ntg.x*QK4_1) {
+        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        float min = FLT_MAX;
+        float max = -FLT_MAX;
+
+        for (int j = 0; j < QK4_1; j++) {
+            const float v = src[j];
+            if (min > v) min = v;
+            if (max < v) max = v;
+        }
+
+        const float d = (max - min) / ((1 << 4) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK4_1].d = d;
+        dst_data[i00/QK4_1].m = min;
+
+        for (int j = 0; j < QK4_1/2; ++j) {
+            const float x0 = (src[0       + j] - min)*id;
+            const float x1 = (src[QK4_1/2 + j] - min)*id;
+
+            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
+            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
+
+            dst_data[i00/QK4_1].qs[j]  = xi0;
+            dst_data[i00/QK4_1].qs[j] |= xi1 << 4;
+        }
+    }
+}
+
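+// quantize to Q5_0 while copying: the 5-bit analogue of Q4_0; the low 4 bits of
+// each code go into qs and the 32 fifth (high) bits are collected in qh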
+kernel void kernel_cpy_f32_q5_0(
+        device const float * src0,
+        device        void * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK5_0;
+
+    device block_q5_0 * dst_data = (device block_q5_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x*QK5_0; i00 < ne00; i00 += ntg.x*QK5_0) {
+        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < QK5_0; j++) {
+            const float v = src[j];
+            if (amax < fabs(v)) {
+                amax = fabs(v);
+                max  = v;
+            }
+        }
+
+        const float d = max / -16;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK5_0].d = d;
+
+        uint32_t qh = 0;
+        for (int j = 0; j < QK5_0/2; ++j) {
+            const float x0 = src[0       + j]*id;
+            const float x1 = src[QK5_0/2 + j]*id;
+
+            const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
+            const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
+
+            dst_data[i00/QK5_0].qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+        }
+        thread const uint8_t * qh8 = (thread const uint8_t *)&qh;
+        for (int j = 0; j < 4; ++j) {
+            dst_data[i00/QK5_0].qh[j] = qh8[j];
+        }
+    }
+}
+
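+// quantize to Q5_1 while copying: Q5_0 with an explicit minimum, d = (max - min)/31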
+kernel void kernel_cpy_f32_q5_1(
+        device const float * src0,
+        device        void * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK5_1;
+
+    device block_q5_1 * dst_data = (device block_q5_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x*QK5_1; i00 < ne00; i00 += ntg.x*QK5_1) {
+        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        float max = src[0];
+        float min = src[0];
+
+        for (int j = 1; j < QK5_1; j++) {
+            const float v = src[j];
+            min = v < min ? v : min;
+            max = v > max ? v : max;
+        }
+
+        const float d = (max - min) / 31;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        dst_data[i00/QK5_1].d = d;
+        dst_data[i00/QK5_1].m = min;
+
+        uint32_t qh = 0;
+        for (int j = 0; j < QK5_1/2; ++j) {
+            const float x0 = (src[0       + j] - min)*id;
+            const float x1 = (src[QK5_1/2 + j] - min)*id;
+
+            const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
+            const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
+
+            dst_data[i00/QK5_1].qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
+        }
+        thread const uint8_t * qh8 = (thread const uint8_t *)&qh;
+        for (int j = 0; j < 4; ++j) {
+            dst_data[i00/QK5_1].qh[j] = qh8[j];
+        }
+    }
+}
+
+static inline int best_index_int8(int n, constant float * val, float x) {
+    if (x <= val[0]) return 0;
+    if (x >= val[n-1]) return n-1;
+    int ml = 0, mu = n-1;
+    while (mu-ml > 1) {
+        int mav = (ml+mu)/2;
+        if (x < val[mav]) mu = mav; else ml = mav;
+    }
+    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
+}
+
+constexpr constant static float kvalues_iq4nl_f[16] = {
+    -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
+};
+
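+// quantize to IQ4_NL while copying: instead of a uniform 4-bit grid, each code
+// indexes the non-linear 16-entry table above; best_index_int8 binary-searches
+// for the nearest table entry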
+kernel void kernel_cpy_f32_iq4_nl(
+        device const float * src0,
+        device        void * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant  uint64_t & nb0,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        constant  uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_NL;
+
+    device block_iq4_nl * dst_data = (device block_iq4_nl *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x*QK4_NL; i00 < ne00; i00 += ntg.x*QK4_NL) {
+        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+
+        float amax = 0.0f; // absolute max
+        float max  = 0.0f;
+
+        for (int j = 0; j < QK4_NL; j++) {
+            const float v = src[j];
+            if (amax < fabs(v)) {
+                amax = fabs(v);
+                max  = v;
+            }
+        }
+
+        const float d = max / kvalues_iq4nl_f[0];
+        const float id = d ? 1.0f/d : 0.0f;
+
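+        // refine the scale with a weighted least-squares fit: with weights w = x^2,
+        // minimizing sum w*(x - d*v)^2 over d gives d = sum(w*v*x)/sum(w*v*v),
+        // accumulated below as sumqx/sumq2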
+        float sumqx = 0, sumq2 = 0;
+        for (int j = 0; j < QK4_NL/2; ++j) {
+            const float x0 = src[0        + j]*id;
+            const float x1 = src[QK4_NL/2 + j]*id;
+
+            const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl_f, x0);
+            const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl_f, x1);
+
+            dst_data[i00/QK4_NL].qs[j] = xi0 | (xi1 << 4);
+
+            const float v0 = kvalues_iq4nl_f[xi0];
+            const float v1 = kvalues_iq4nl_f[xi1];
+            const float w0 = src[0        + j]*src[0        + j];
+            const float w1 = src[QK4_NL/2 + j]*src[QK4_NL/2 + j];
+            sumqx += w0*v0*src[j] + w1*v1*src[QK4_NL/2 + j];
+            sumq2 += w0*v0*v0 + w1*v1*v1;
+
+        }
+
+        dst_data[i00/QK4_NL].d = sumq2 > 0 ? sumqx/sumq2 : d;
+
+    }
+}
+
+kernel void kernel_concat(
+    device  const char * src0,
+    device  const char * src1,
+    device        char * dst,
+    constant   int64_t & ne00,
+    constant   int64_t & ne01,
+    constant   int64_t & ne02,
+    constant   int64_t & ne03,
+    constant  uint64_t & nb00,
+    constant  uint64_t & nb01,
+    constant  uint64_t & nb02,
+    constant  uint64_t & nb03,
+    constant   int64_t & ne10,
+    constant   int64_t & ne11,
+    constant   int64_t & ne12,
+    constant   int64_t & ne13,
+    constant  uint64_t & nb10,
+    constant  uint64_t & nb11,
+    constant  uint64_t & nb12,
+    constant  uint64_t & nb13,
+    constant   int64_t & ne0,
+    constant   int64_t & ne1,
+    constant   int64_t & ne2,
+    constant   int64_t & ne3,
+    constant  uint64_t & nb0,
+    constant  uint64_t & nb1,
+    constant  uint64_t & nb2,
+    constant  uint64_t & nb3,
+    uint3 tgpig[[threadgroup_position_in_grid]],
+    uint3 tpitg[[thread_position_in_threadgroup]],
+    uint3   ntg[[threads_per_threadgroup]]) {
+
+    const int64_t i03 = tgpig.z;
+    const int64_t i02 = tgpig.y;
+    const int64_t i01 = tgpig.x;
+
+    const int64_t i13 = i03 % ne13;
+    const int64_t i12 = i02 % ne12;
+    const int64_t i11 = i01 % ne11;
+
+    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
+    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
+    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
+
+    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
+        if (i02 < ne02) {
+            ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
+            src0_ptr += ntg.x*nb00;
+        } else {
+            ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
+            src1_ptr += ntg.x*nb10;
+        }
+        dst_ptr += ntg.x*nb0;
+    }
+}
+
+void kernel_mul_mv_q2_K_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0;
+    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int step = sizeof(block_q2_K) * nb;
+
+#if QK_K == 256
+    const int ix = tiisg/8;  // 0...3
+    const int it = tiisg%8;  // 0...7
+    const int iq = it/4;     // 0 or 1
+    const int ir = it%4;     // 0...3
+    const int is = (8*ir)/16; // 0 or 1
+
+    device const float * y4 = y + ix * QK_K + 128 * iq + 8 * ir;
+
+    for (int ib = ix; ib < nb; ib += 4) {
+
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
+            yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8];
+            yl[i+16] = y4[i+64]; sumy[2] += yl[i+16];
+            yl[i+24] = y4[i+96]; sumy[3] += yl[i+24];
+        }
+
+        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales + 8*iq + is;
+        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir;
+        device const half     * dh = &x[ib].d;
+
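+        // each Q2_K scale byte packs a 4-bit scale (low nibble, applied with dall)
+        // and a 4-bit min (high nibble, applied with dmin); dmin is pre-scaled by
+        // 1/16 in the loop below so the high nibble can be used without shifting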
+        for (int row = 0; row < N_DST; row++) {
+
+            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
+            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
+                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
+                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
+                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
+                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
+                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
+                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
+                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
+            }
+            float dall = dh[0];
+            float dmin = dh[1] * 1.f/16.f;
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
+                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f +
+                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f +
+                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) -
+                         dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0));
+
+            qs += step/2;
+            sc += step;
+            dh += step/2;
+        }
+
+        y4 += 4 * QK_K;
+    }
+#else
+    const int ix = tiisg/2;  // 0...15
+    const int it = tiisg%2;  // 0...1
+
+    device const float * y4 = y + ix * QK_K + 8 * it;
+
+    for (int ib = ix; ib < nb; ib += 16) {
+
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
+            yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
+            yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
+            yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
+        }
+
+        device const uint8_t  * sc = (device const uint8_t  *)x[ib].scales;
+        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
+        device const half     * dh = &x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
+            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
+                acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
+                acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
+                acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
+                acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
+                acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
+                acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
+                acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
+                                 (acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
+                                 (acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
+                                 (acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
+                         dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
+
+            qs += step/2;
+            sc += step;
+            dh += step/2;
+        }
+
+        y4 += 16 * QK_K;
+    }
+#endif
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_q2_K_f32")]]
+kernel void kernel_mul_mv_q2_K_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+}
+
+#if QK_K == 256
+void kernel_mul_mv_q3_K_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+    const int64_t im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0;
+    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+
+    //const uint16_t kmask1 = 0x3030;
+    //const uint16_t kmask2 = 0x0f0f;
+
+    const int tid = tiisg/4;
+    const int ix  = tiisg%4;
+    const int ip  = tid/4;          // 0 or 1
+    const int il  = 2*((tid%4)/2);  // 0 or 2
+    const int ir  = tid%2;
+    const int n   = 8;
+    const int l0  = n*ir;
+
+    // One would think that the Metal compiler would figure out that ip and il can only have
+    // 4 possible states, and optimize accordingly. Well, no. It needs help, and we do it
+    // with these two tables.
+    //
+    // Possible masks for the high bit
+    const ushort4 mm[4] = {{0x0001, 0x0100, 0x0002, 0x0200},  // ip = 0, il = 0
+                           {0x0004, 0x0400, 0x0008, 0x0800},  // ip = 0, il = 2
+                           {0x0010, 0x1000, 0x0020, 0x2000},  // ip = 1, il = 0
+                           {0x0040, 0x4000, 0x0080, 0x8000}}; // ip = 1, il = 2
+
+    // Possible masks for the low 2 bits
+    const int4 qm[2] = {{0x0003, 0x0300, 0x000c, 0x0c00}, {0x0030, 0x3000, 0x00c0, 0xc000}};
+
+    const ushort4 hm = mm[2*ip + il/2];
+
+    const int shift = 2*il;
+    const float    v1 = il == 0 ? 4.f : 64.f;
+    const float    v2 = 4.f * v1;
+
+    const uint16_t s_shift1 = 4*ip;
+    const uint16_t s_shift2 = s_shift1 + il;
+
+    const int q_offset = 32*ip + l0;
+    const int y_offset = 128*ip + 32*il + l0;
+
+    const int step = sizeof(block_q3_K) * nb / 2;
+
+    device const float * y1 = yy + ix*QK_K + y_offset;
+
+    uint32_t scales32, aux32;
+    thread uint16_t * scales16 = (thread uint16_t *)&scales32;
+    thread const int8_t * scales = (thread const int8_t *)&scales32;
+
+    float sumf1[2] = {0.f};
+    float sumf2[2] = {0.f};
+    for (int i = ix; i < nb; i += 4) {
+
+        for (int l = 0; l < 8; ++l) {
+            yl[l+ 0] = y1[l+ 0];
+            yl[l+ 8] = y1[l+16];
+            yl[l+16] = y1[l+32];
+            yl[l+24] = y1[l+48];
+        }
+
+        device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset);
+        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + l0);
+        device const uint16_t * a = (device const uint16_t *)(x[i].scales);
+        device const half * dh = &x[i].d;
+
+        for (int row = 0; row < 2; ++row) {
+
+            const float d_all = (float)dh[0];
+
+            scales16[0] = a[4];
+            scales16[1] = a[5];
+            aux32 = ((scales32 >> s_shift2) << 4) & 0x30303030;
+            scales16[0] = a[il+0];
+            scales16[1] = a[il+1];
+            scales32 = ((scales32 >> s_shift1) & 0x0f0f0f0f) | aux32;
+
+            float s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0;
+            for (int l = 0; l < n; l += 2) {
+                const int32_t qs = q[l/2];
+                s1 += yl[l+0] * (qs & qm[il/2][0]);
+                s2 += yl[l+1] * (qs & qm[il/2][1]);
+                s3 += ((h[l/2] & hm[0]) ? 0.f : yl[l+0]) + ((h[l/2] & hm[1]) ? 0.f : yl[l+1]);
+                s4 += yl[l+16] * (qs & qm[il/2][2]);
+                s5 += yl[l+17] * (qs & qm[il/2][3]);
+                s6 += ((h[l/2] & hm[2]) ? 0.f : yl[l+16]) + ((h[l/2] & hm[3]) ? 0.f : yl[l+17]);
+            }
+            float d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
+            float d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
+            sumf1[row] += d1 * (scales[0] - 32);
+            sumf2[row] += d2 * (scales[2] - 32);
+
+            s1 = s2 = s3 = s4 = s5 = s6 = 0;
+            for (int l = 0; l < n; l += 2) {
+                const int32_t qs = q[l/2+8];
+                s1 += yl[l+8] * (qs & qm[il/2][0]);
+                s2 += yl[l+9] * (qs & qm[il/2][1]);
+                s3 += ((h[l/2+8] & hm[0]) ? 0.f : yl[l+8]) + ((h[l/2+8] & hm[1]) ? 0.f : yl[l+9]);
+                s4 += yl[l+24] * (qs & qm[il/2][2]);
+                s5 += yl[l+25] * (qs & qm[il/2][3]);
+                s6 += ((h[l/2+8] & hm[2]) ? 0.f : yl[l+24]) + ((h[l/2+8] & hm[3]) ? 0.f : yl[l+25]);
+            }
+            d1 = d_all * (s1 + 1.f/256.f * s2 - s3*v1);
+            d2 = d_all * (s4 + 1.f/256.f * s5 - s6*v2);
+            sumf1[row] += d1 * (scales[1] - 32);
+            sumf2[row] += d2 * (scales[3] - 32);
+
+            q  += step;
+            h  += step;
+            a  += step;
+            dh += step;
+
+        }
+
+        y1 += 4 * QK_K;
+
+    }
+
+    for (int row = 0; row < 2; ++row) {
+        const float sumf = (sumf1[row] + 0.25f * sumf2[row]) / (1 << shift);
+        sumf1[row] = simd_sum(sumf);
+    }
+    if (tiisg == 0) {
+        for (int row = 0; row < 2; ++row) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = sumf1[row];
+        }
+    }
+}
+#else
+void kernel_mul_mv_q3_K_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+    const int64_t im = tgpig.z;
+
+    const int row = 2 * r0 + sgitg;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
+    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+
+    const int ix = tiisg/4;
+    const int il = 4 * (tiisg%4);// 0, 4, 8, 12
+    const int iq = il/8;         // 0, 0, 1, 1
+    const int in = il%8;         // 0, 4, 0, 4
+
+    float2 sum = {0.f, 0.f};
+
+    for (int i = ix; i < nb; i += 8) {
+
+        const float d_all = (float)(x[i].d);
+
+        device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
+        device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
+        device const uint16_t * s = (device const uint16_t *)(x[i].scales);
+        device const float    * y = yy + i * QK_K + il;
+
+        const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
+        const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
+        const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
+        const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
+
+        for (int l = 0; l < 4; l += 2) {
+            const uint16_t hm = h[l/2] >> iq;
+            sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 :  4))
+                    + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
+                    + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
+                    + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
+            sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
+                    + y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
+                    + y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
+                    + y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
+        }
+
+    }
+    const float sumf = sum[0] + sum[1] * 1.f/256.f;
+
+    const float tot = simd_sum(sumf);
+    if (tiisg == 0) {
+        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
+    }
+
+}
+#endif
+
+[[host_name("kernel_mul_mv_q3_K_f32")]]
+kernel void kernel_mul_mv_q3_K_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+}
+
+#if QK_K == 256
+void kernel_mul_mv_q4_K_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int ix = tiisg/8;  // 0...3
+    const int it = tiisg%8;  // 0...7
+    const int iq = it/4;     // 0 or 1
+    const int ir = it%4;     // 0...3
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+    //const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int first_row = r0 * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
+    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[16];
+    float yh[16];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int step = sizeof(block_q4_K) * nb / 2;
+
+    device const float * y4 = y + ix * QK_K + 64 * iq + 8 * ir;
+
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
+
+    for (int ib = ix; ib < nb; ib += 4) {
+
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+0] = y4[i+  0]; sumy[0] += yl[i+0];
+            yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
+            yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
+            yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
+        }
+
+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales + iq;
+        device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * iq + 4 * ir;
+        device const half     * dh = &x[ib].d;
+
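+        // the 12 scale bytes of Q4_K pack 8 6-bit scales and 8 6-bit mins; the
+        // masks kmask1/2/3 extract them two at a time into 16-bit words, which
+        // sc8 then reads byte-wise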
+        for (int row = 0; row < N_DST; row++) {
+
+            sc16[0] = sc[0] & kmask1;
+            sc16[1] = sc[2] & kmask1;
+            sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
+            sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
+
+            device const uint16_t * q2 = q1 + 32;
+
+            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
+            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
+                acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
+                acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
+                acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
+                acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
+                acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
+                acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
+                acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
+                                 (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
+                                 (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += step;
+            sc += step;
+            dh += step;
+        }
+
+        y4 += 4 * QK_K;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+        }
+    }
+}
+#else
+void kernel_mul_mv_q4_K_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int ix = tiisg/4;  // 0...7
+    const int it = tiisg%4;  // 0...3
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+    const int first_row = r0 * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
+    device const float      * y = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[8];
+    float yh[8];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int step = sizeof(block_q4_K) * nb / 2;
+
+    device const float * y4 = y + ix * QK_K + 8 * it;
+
+    uint16_t sc16[4];
+
+    for (int ib = ix; ib < nb; ib += 8) {
+
+        float2 sumy = {0.f, 0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
+            yh[i] = y4[i+32]; sumy[1] += yh[i];
+        }
+
+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
+        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
+        device const half     * dh = x[ib].d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            sc16[0] = sc[0] & 0x000f;
+            sc16[1] = sc[0] & 0x0f00;
+            sc16[2] = sc[0] & 0x00f0;
+            sc16[3] = sc[0] & 0xf000;
+
+            float2 acc1 = {0.f, 0.f};
+            float2 acc2 = {0.f, 0.f};
+            for (int i = 0; i < 8; i += 2) {
+                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
+                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
+                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
+                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
+            }
+
+            float dall = dh[0];
+            float dmin = dh[1];
+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
+                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
+
+            qs += step;
+            sc += step;
+            dh += step;
+        }
+
+        y4 += 8 * QK_K;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+        }
+    }
+}
+#endif
+
+[[host_name("kernel_mul_mv_q4_K_f32")]]
+kernel void kernel_mul_mv_q4_K_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+}
+
+void kernel_mul_mv_q5_K_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0;
+    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float sumf[2]={0.f};
+
+    const int step = sizeof(block_q5_K) * nb;
+
+#if QK_K == 256
+    float yl[16], yh[16];
+
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid = tiisg/4;
+    const int ix  = tiisg%4;
+    const int iq  = tid/4;
+    const int ir  = tid%4;
+    const int n   = 8;
+
+    const int l0 = n*ir;
+    const int q_offset = 32*iq + l0;
+    const int y_offset = 64*iq + l0;
+
+    const uint8_t hm1 = 1u << (2*iq);
+    const uint8_t hm2 = hm1 << 1;
+    const uint8_t hm3 = hm1 << 4;
+    const uint8_t hm4 = hm2 << 4;
+
+    uint16_t sc16[4];
+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
+
+    device const float * y1 = yy + ix*QK_K + y_offset;
+
+    for (int i = ix; i < nb; i += 4) {
+
+        device const uint8_t * q1 = x[i].qs + q_offset;
+        device const uint8_t * qh = x[i].qh + l0;
+        device const half * dh = &x[i].d;
+        device const uint16_t * a = (device const uint16_t *)x[i].scales + iq;
+
+        device const float * y2 = y1 + 128;
+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 8; ++l) {
+            yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
+            yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
+            yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
+            yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
+        }
+
+        for (int row = 0; row < 2; ++row) {
+
+            device const uint8_t * q2 = q1 + 64;
+
+            sc16[0] = a[0] & kmask1;
+            sc16[1] = a[2] & kmask1;
+            sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
+            sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
+
+            float4 acc1 = {0.f};
+            float4 acc2 = {0.f};
+            for (int l = 0; l < n; ++l) {
+                uint8_t h = qh[l];
+                acc1[0] += yl[l+0] * (q1[l] & 0x0F);
+                acc1[1] += yl[l+8] * (q1[l] & 0xF0);
+                acc1[2] += yh[l+0] * (q2[l] & 0x0F);
+                acc1[3] += yh[l+8] * (q2[l] & 0xF0);
+                acc2[0] += h & hm1 ? yl[l+0] : 0.f;
+                acc2[1] += h & hm2 ? yl[l+8] : 0.f;
+                acc2[2] += h & hm3 ? yh[l+0] : 0.f;
+                acc2[3] += h & hm4 ? yh[l+8] : 0.f;
+            }
+            const float dall = dh[0];
+            const float dmin = dh[1];
+            sumf[row] += dall * (sc8[0] * (acc1[0] +  16.f*acc2[0]) +
+                                 sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) +
+                                 sc8[4] * (acc1[2] +  16.f*acc2[2]) +
+                                 sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) -
+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
+
+            q1 += step;
+            qh += step;
+            dh += step/2;
+            a  += step/2;
+
+        }
+
+        y1 += 4 * QK_K;
+
+    }
+#else
+    float yl[8], yh[8];
+
+    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
+    const int ix = tiisg%8;
+    const int iq = il/8;         // 0, 0, 1, 1
+    const int in = il%8;         // 0, 4, 0, 4
+
+    device const float * y = yy + ix*QK_K + il;
+
+    for (int i = ix; i < nb; i += 8) {
+
+        for (int l = 0; l < 4; ++l) {
+            yl[l+0] = y[l+ 0];
+            yl[l+4] = y[l+16];
+            yh[l+0] = y[l+32];
+            yh[l+4] = y[l+48];
+        }
+
+        device const half * dh = &x[i].d;
+        device const uint8_t * q = x[i].qs + il;
+        device const uint8_t * h = x[i].qh + in;
+        device const int8_t  * s = x[i].scales;
+
+        for (int row = 0; row < 2; ++row) {
+
+            const float d = dh[0];
+
+            float2 acc = {0.f, 0.f};
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t hl = h[l] >> iq;
+                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
+                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
+                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
+                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
+            }
+            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
+
+            q += step;
+            h += step;
+            s += step;
+            dh += step/2;
+
+        }
+
+        y += 8 * QK_K;
+    }
+#endif
+
+    for (int row = 0; row < 2; ++row) {
+        const float tot = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_q5_K_f32")]]
+kernel void kernel_mul_mv_q5_K_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+}
+
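+// q6_K mat-vec product: 6-bit quants split across ql (low 4 bits) and qh
+// (top 2 bits), with signed 8-bit sub-block scales; each weight decodes as
+// w = d * sc * (q - 32), hence the "- 32" bias in the inner loops.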
+void kernel_mul_mv_q6_K_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const uint8_t kmask1 = 0x03;
+    const uint8_t kmask2 = 0x0C;
+    const uint8_t kmask3 = 0x30;
+    const uint8_t kmask4 = 0xC0;
+
+    const int nb = ne00/QK_K;
+
+    const int64_t r0 = tgpig.x;
+    const int64_t r1 = tgpig.y;
+    const int     im = tgpig.z;
+
+    const int row = 2 * r0 + sgitg;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0;
+    device const float     * yy = (device const float      *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float sumf = 0;
+
+#if QK_K == 256
+    const int tid  = tiisg/2;
+    const int ix   = tiisg%2;
+    const int ip   = tid/8;         // 0 or 1
+    const int il   = tid%8;
+    const int n    = 4;
+    const int l0   = n*il;
+    const int is   = 8*ip + l0/16;
+
+    const int y_offset = 128*ip + l0;
+    const int q_offset_l = 64*ip + l0;
+    const int q_offset_h = 32*ip + l0;
+
+    for (int i = ix; i < nb; i += 2) {
+
+        device const uint8_t * q1 = x[i].ql + q_offset_l;
+        device const uint8_t * q2 = q1 + 32;
+        device const uint8_t * qh = x[i].qh + q_offset_h;
+        device const int8_t  * sc = x[i].scales + is;
+
+        device const float * y = yy + i * QK_K + y_offset;
+
+        const float dall = x[i].d;
+
+        float4 sums = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < n; ++l) {
+            sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
+            sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
+            sums[2] += y[l+64] * ((int8_t)((q1[l]  >> 4) | ((qh[l] & kmask3) << 0)) - 32);
+            sums[3] += y[l+96] * ((int8_t)((q2[l]  >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
+        }
+
+        sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
+
+    }
+
+#else
+    const int ix  = tiisg/4;
+    const int il  = 4*(tiisg%4);
+
+    for (int i = ix; i < nb; i += 8) {
+        device const float * y = yy + i * QK_K + il;
+        device const uint8_t * ql = x[i].ql + il;
+        device const uint8_t * qh = x[i].qh + il;
+        device const int8_t  * s  = x[i].scales;
+
+        const float d = x[i].d;
+
+        float4 sums = {0.f, 0.f, 0.f, 0.f};
+        for (int l = 0; l < 4; ++l) {
+            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
+            sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
+            sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >>  4) | ((qh[l] & kmask3) >> 0)) - 32);
+            sums[3] += y[l+48] * ((int8_t)((ql[l+16] >>  4) | ((qh[l] & kmask4) >> 2)) - 32);
+        }
+        sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
+    }
+
+#endif
+
+    const float tot = simd_sum(sumf);
+    if (tiisg == 0) {
+        dst[r1*ne0 + im*ne0*ne1 + row] = tot;
+    }
+}
+
+[[host_name("kernel_mul_mv_q6_K_f32")]]
+kernel void kernel_mul_mv_q6_K_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+}
+
+// ======================= "True" 2-bit
+
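+// iq2_xxs (roughly 2.06 bpw): each group of 8 weights is a row of the
+// iq2xxs_grid codebook plus a 7-bit sign pattern from ksigns_iq2xs, with a
+// 4-bit block scale in the top bits of aux32. Both tables are staged into
+// threadgroup memory up front; the trailing 0.25f compensates for the
+// pre-scaled grid entries.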
+void kernel_mul_mv_iq2_xxs_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_iq2_xxs * x = (device const block_iq2_xxs *) src0 + ib_row + offset0;
+    device const float         * y = (device const float         *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
+    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 256);
+    {
+        int nval = 4;
+        int pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) values[pos + i] = iq2xxs_grid[pos + i];
+        nval = 2;
+        pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq2_xxs * xr = x + ibl;
+        device const uint16_t * q2 = xr->qs + 4 * ib;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            device const uint8_t * aux8 = (device const uint8_t *)q2;
+            const uint32_t aux32 = q2[2] | (q2[3] << 16);
+            const float d = db * (0.5f + (aux32 >> 28));
+
+            float sum = 0;
+            for (int l = 0; l < 4; ++l) {
+                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + aux8[l]);
+                const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
+                for (int j = 0; j < 8; ++j) {
+                    sum += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+            }
+            sumf[row] += d * sum;
+
+            dh += nb*sizeof(block_iq2_xxs)/2;
+            q2 += nb*sizeof(block_iq2_xxs)/2;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq2_xxs_f32")]]
+kernel void kernel_mul_mv_iq2_xxs_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        threadgroup int8_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+}
+
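+// iq2_xs (roughly 2.31 bpw): same codebook-plus-signs scheme as iq2_xxs, but
+// each uint16 of qs packs a 9-bit index into the larger iq2xs_grid and 7 sign
+// bits, with explicit 4-bit scales (two per 32 weights) in xr->scales.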
+void kernel_mul_mv_iq2_xs_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0;
+    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
+    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 512);
+    {
+        int nval = 8;
+        int pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) values[pos + i] = iq2xs_grid[pos + i];
+        nval = 2;
+        pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq2_xs * xr = x + ibl;
+        device const uint16_t * q2 = xr->qs + 4 * ib;
+        device const uint8_t  * sc = xr->scales + ib;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            const uint8_t ls1 = sc[0] & 0xf;
+            const uint8_t ls2 = sc[0] >>  4;
+            const float d1 = db * (0.5f + ls1);
+            const float d2 = db * (0.5f + ls2);
+
+            float sum1 = 0, sum2 = 0;
+            for (int l = 0; l < 2; ++l) {
+                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
+                const uint8_t signs = shared_signs[(q2[l] >> 9)];
+                for (int j = 0; j < 8; ++j) {
+                    sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+            }
+            for (int l = 2; l < 4; ++l) {
+                const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511));
+                const uint8_t signs = shared_signs[(q2[l] >> 9)];
+                for (int j = 0; j < 8; ++j) {
+                    sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+                }
+            }
+            sumf[row] += d1 * sum1 + d2 * sum2;
+
+            dh += nb*sizeof(block_iq2_xs)/2;
+            q2 += nb*sizeof(block_iq2_xs)/2;
+            sc += nb*sizeof(block_iq2_xs);
+        }
+
+        y4 += 32 * 32;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq2_xs_f32")]]
+kernel void kernel_mul_mv_iq2_xs_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        threadgroup int8_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+}
+
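+// iq3_xxs (roughly 3.06 bpw): every byte of q3 indexes a 4-value row of
+// iq3xxs_grid; signs and the 4-bit block scale come from the aux32 word
+// assembled out of gas. The trailing 0.5f compensates for the pre-scaled
+// grid entries.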
+void kernel_mul_mv_iq3_xxs_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_iq3_xxs * x = (device const block_iq3_xxs *) src0 + ib_row + offset0;
+    device const float         * y = (device const float         *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
+    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 256);
+    {
+        int nval = 4;
+        int pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) values[pos + i] = iq3xxs_grid[pos + i];
+        nval = 2;
+        pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq3_xxs * xr = x + ibl;
+        device const uint8_t  * q3 = xr->qs + 8 * ib;
+        device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            const uint32_t aux32 = gas[0] | (gas[1] << 16);
+            const float d = db * (0.5f + (aux32 >> 28));
+
+            float2 sum = {0};
+            for (int l = 0; l < 4; ++l) {
+                const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + q3[2*l+0]);
+                const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + q3[2*l+1]);
+                const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
+                for (int j = 0; j < 4; ++j) {
+                    sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+                    sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+                }
+            }
+            sumf[row] += d * (sum[0] + sum[1]);
+
+            dh  += nb*sizeof(block_iq3_xxs)/2;
+            q3  += nb*sizeof(block_iq3_xxs);
+            gas += nb*sizeof(block_iq3_xxs)/2;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq3_xxs_f32")]]
+kernel void kernel_mul_mv_iq3_xxs_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        threadgroup int8_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+}
+
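+// iq3_s: like iq3_xxs, but the grid index gains a 9th bit from qh (selecting
+// the upper half of iq3s_grid via table1/table2), signs are stored explicitly
+// per 8 weights, and the 4-bit scales map through d * (1 + 2*sc).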
+void kernel_mul_mv_iq3_s_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0;
+    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
+    {
+        int nval = 8;
+        int pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) values[pos + i] = iq3s_grid[pos + i];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq3_s * xr = x + ibl;
+        device const uint8_t * qs = xr->qs + 8 * ib;
+        device const uint8_t * qh = xr->qh + ib;
+        device const uint8_t * sc = xr->scales + (ib/2);
+        device const uint8_t * signs = xr->signs + 4 * ib;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            const float d = db * (1 + 2*((sc[0] >> 4*(ib%2)) & 0xf));
+
+            float2 sum = {0};
+            for (int l = 0; l < 4; ++l) {
+                const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? values + 256 : values;
+                const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? values + 256 : values;
+                const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]);
+                const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]);
+                for (int j = 0; j < 4; ++j) {
+                    sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]);
+                    sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]);
+                }
+            }
+            sumf[row] += d * (sum[0] + sum[1]);
+
+            dh  += nb*sizeof(block_iq3_s)/2;
+            qs  += nb*sizeof(block_iq3_s);
+            qh  += nb*sizeof(block_iq3_s);
+            sc  += nb*sizeof(block_iq3_s);
+            signs += nb*sizeof(block_iq3_s);
+        }
+
+        y4 += 32 * 32;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq3_s_f32")]]
+kernel void kernel_mul_mv_iq3_s_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        threadgroup int8_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+}
+
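+// iq2_s (roughly 2.5 bpw): 10-bit grid indices assembled from a qs byte plus
+// two qh bits, explicit sign bytes stored after the indices, and two 4-bit
+// scales per 32 weights. The threadgroup staging of iq2s_grid is left
+// disabled below; the grid is read straight from constant memory instead.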
+void kernel_mul_mv_iq2_s_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_iq2_s * x = (device const block_iq2_s *) src0 + ib_row + offset0;
+    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    //threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values;
+    //{
+    //    int nval = 32;
+    //    int pos  = (32*sgitg + tiisg)*nval;
+    //    for (int i = 0; i < nval; ++i) values[pos + i] = iq2s_grid[pos + i];
+    //    threadgroup_barrier(mem_flags::mem_threadgroup);
+    //}
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq2_s * xr = x + ibl;
+        device const uint8_t * qs = xr->qs + 4 * ib;
+        device const uint8_t * qh = xr->qh + ib;
+        device const uint8_t * sc = xr->scales + ib;
+        device const uint8_t * signs = qs + QK_K/8;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            const float d1 = db * (0.5f + (sc[0] & 0xf));
+            const float d2 = db * (0.5f + (sc[0] >>  4));
+
+            float2 sum = {0};
+            for (int l = 0; l < 2; ++l) {
+                //const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
+                //const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
+                constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[l+0] | ((qh[0] << (8-2*l)) & 0x300)));
+                constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[l+2] | ((qh[0] << (4-2*l)) & 0x300)));
+                for (int j = 0; j < 8; ++j) {
+                    sum[0] += yl[8*l + j +  0] * grid1[j] * select(1, -1, signs[l+0] & kmask_iq2xs[j]);
+                    sum[1] += yl[8*l + j + 16] * grid2[j] * select(1, -1, signs[l+2] & kmask_iq2xs[j]);
+                }
+            }
+            sumf[row] += d1 * sum[0] + d2 * sum[1];
+
+            dh  += nb*sizeof(block_iq2_s)/2;
+            qs  += nb*sizeof(block_iq2_s);
+            qh  += nb*sizeof(block_iq2_s);
+            sc  += nb*sizeof(block_iq2_s);
+            signs += nb*sizeof(block_iq2_s);
+        }
+
+        y4 += 32 * 32;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq2_s_f32")]]
+kernel void kernel_mul_mv_iq2_s_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        threadgroup int8_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq2_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+}
+
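+// iq1_s (roughly 1.56 bpw): 1-bit codebook quants. Each 11-bit grid index is
+// a qs byte plus three bits of qh, and every grid byte packs two 4-bit
+// values; a per-block sign/offset (IQ1S_DELTA) and 3-bit scale are applied
+// through sumy and the (2*s + 1) factor.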
+void kernel_mul_mv_iq1_s_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_value,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
+    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        float sumy = 0;
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+            sumy += yl[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq1_s * xr = x + ibl;
+        device const uint8_t  * qs = xr->qs + 4 * ib;
+        device const uint16_t * qh = xr->qh + ib;
+        device const half     * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+            constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 5) & 0x700)));
+            constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[0] << 2) & 0x700)));
+            constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[0] >> 1) & 0x700)));
+
+            float sum = 0;
+            for (int j = 0; j < 4; ++j) {
+                sum += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4)
+                     + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4)
+                     + yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4)
+                     + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4);
+            }
+            sumf[row] += (float)dh[0] * (sum + sumy * (qh[0] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA)) * (2*((qh[0] >> 12) & 7) + 1);
+
+            dh += nb*sizeof(block_iq1_s)/2;
+            qs += nb*sizeof(block_iq1_s);
+            qh += nb*sizeof(block_iq1_s)/2;
+        }
+
+        y4 += 32 * 32;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+        }
+    }
+}
+
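+// iq1_m (roughly 1.75 bpw): iq1_s-style grid lookups with per-half-block
+// deltas (IQ1M_DELTA, signed by qh bits) and scales recovered from the packed
+// sc words; for QK_K == 64 builds the fp16 super-block scale is read from
+// just before the scales.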
+void kernel_mul_mv_iq1_m_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_value,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    device const block_iq1_m * x = (device const block_iq1_m *) src0 + ib_row + offset0;
+    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+#if QK_K != 64
+    iq1m_scale_t scale;
+#endif
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        float4 sumy = {0.f};
+        for (int i = 0; i < 8; ++i) {
+            yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
+            yl[i+ 8] = y4[i+ 8]; sumy[1] += yl[i+ 8];
+            yl[i+16] = y4[i+16]; sumy[2] += yl[i+16];
+            yl[i+24] = y4[i+24]; sumy[3] += yl[i+24];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq1_m * xr = x + ibl;
+        device const uint8_t  * qs = xr->qs + 4 * ib;
+        device const uint8_t  * qh = xr->qh + 2 * ib;
+        device const uint16_t * sc = (device const uint16_t *)xr->scales;
+
+        for (int row = 0; row < N_DST; row++) {
+
+#if QK_K != 64
+            scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+#endif
+
+            constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+            constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
+            constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[1] << 8) & 0x700)));
+            constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[1] << 4) & 0x700)));
+
+            float2 sum = {0.f};
+            for (int j = 0; j < 4; ++j) {
+                sum[0] += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4)
+                        + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4);
+                sum[1] += yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4)
+                        + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4);
+            }
+            const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+            const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+#if QK_K == 64
+            const float d = (float) *((device const half *)(sc - 1));
+            sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) +
+                              (sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1));
+#else
+            sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
+                                             (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
+#endif
+
+            sc += nb*sizeof(block_iq1_m)/2;
+            qs += nb*sizeof(block_iq1_m);
+            qh += nb*sizeof(block_iq1_m);
+        }
+
+        y4 += 32 * 32;
+    }
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+        }
+    }
+}
+
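+// iq4_nl: non-linear 4-bit quantization in blocks of QK4_NL (32). Nibbles are
+// not scaled linearly but mapped through the 16-entry kvalues_iq4nl_f table,
+// which each thread first mirrors into threadgroup memory.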
+void kernel_mul_mv_iq4_nl_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values_i8,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    threadgroup float * shared_values = (threadgroup float *)shared_values_i8;
+    const int nb = ne00/QK4_NL;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+    const int first_row = (r0 * 2 + sgitg) * 2;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
+    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
+
+    const int ix = tiisg/2;  // 0...15
+    const int it = tiisg%2;  // 0 or 1
+
+    shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    float4 yl[4];
+    float sumf[2]={0.f}, all_sum;
+
+    device const float * yb = y + ix * QK4_NL + it * 8;
+
+    uint32_t aux32[2];
+    thread const uint8_t * q8 = (thread const uint8_t *)aux32;
+
+    float4 qf1, qf2;
+
+    for (int ib = ix; ib < nb; ib += 16) {
+
+        device const float4 * y4 = (device const float4 *)yb;
+        yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
+
+        for (int row = 0; row < 2; ++row) {
+
+            device const block_iq4_nl & xb = x[row*nb + ib];
+            device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
+
+            float4 acc1 = {0.f}, acc2 = {0.f};
+
+            aux32[0] = q4[0] | (q4[1] << 16);
+            aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
+            aux32[0] &= 0x0f0f0f0f;
+            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
+            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
+            acc1 += yl[0] * qf1;
+            acc2 += yl[1] * qf2;
+
+            aux32[0] = q4[2] | (q4[3] << 16);
+            aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
+            aux32[0] &= 0x0f0f0f0f;
+            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
+            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
+            acc1 += yl[2] * qf1;
+            acc2 += yl[3] * qf2;
+
+            acc1 += acc2;
+
+            sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
+
+        }
+
+        yb += 16 * QK4_NL;
+    }
+
+    for (int row = 0; row < 2; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+        }
+    }
+}
+
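+// iq4_xs: the super-block (QK_K) variant of iq4_nl, adding 6-bit sub-block
+// scales split across scales_l/scales_h. Compiled out for QK_K == 64, where
+// the wrapper below falls back to the iq4_nl kernel.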
+#if QK_K != 64
+void kernel_mul_mv_iq4_xs_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values_i8,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg) {
+
+    threadgroup float * shared_values = (threadgroup float *)shared_values_i8;
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+    const int first_row = (r0 * 2 + sgitg) * 2;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    device const block_iq4_xs * x = (device const block_iq4_xs *) src0 + ib_row + offset0;
+    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
+
+    const int ix = tiisg/16;  // 0 or 1
+    const int it = tiisg%16;  // 0...15
+    const int ib = it/2;
+    const int il = it%2;
+
+    shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    float4 yl[4];
+    float sumf[2]={0.f}, all_sum;
+
+    device const float * yb = y + ix * QK_K + ib * 32 + il * 8;
+
+    uint32_t aux32[2];
+    thread const uint8_t * q8 = (thread const uint8_t *)aux32;
+
+    float4 qf1, qf2;
+
+    for (int ibl = ix; ibl < nb; ibl += 2) {
+
+        device const float4 * y4 = (device const float4 *)yb;
+        yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
+
+        for (int row = 0; row < 2; ++row) {
+
+            device const block_iq4_xs & xb = x[row*nb + ibl];
+            device const uint32_t * q4 = (device const uint32_t *)(xb.qs + 16*ib + 8*il);
+
+            float4 acc1 = {0.f}, acc2 = {0.f};
+
+            aux32[0] = q4[0] & 0x0f0f0f0f;
+            aux32[1] = (q4[0] >> 4) & 0x0f0f0f0f;
+            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
+            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
+            acc1 += yl[0] * qf1;
+            acc2 += yl[1] * qf2;
+
+            aux32[0] = q4[1] & 0x0f0f0f0f;
+            aux32[1] = (q4[1] >> 4) & 0x0f0f0f0f;
+            qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
+            qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
+            acc1 += yl[2] * qf1;
+            acc2 += yl[3] * qf2;
+
+            acc1 += acc2;
+
+            const int ls = (((xb.scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((xb.scales_h >> 2*ib) & 3) << 4)) - 32;
+            sumf[row] += (float)xb.d * ls * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
+
+        }
+
+        yb += 2 * QK_K;
+    }
+
+    for (int row = 0; row < 2; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
+        }
+    }
+}
+#endif
+
+[[host_name("kernel_mul_mv_iq1_s_f32")]]
+kernel void kernel_mul_mv_iq1_s_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+}
+
+[[host_name("kernel_mul_mv_iq1_m_f32")]]
+kernel void kernel_mul_mv_iq1_m_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq1_m_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
+}
+
+[[host_name("kernel_mul_mv_iq4_nl_f32")]]
+kernel void kernel_mul_mv_iq4_nl_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        threadgroup int8_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+}
+
+[[host_name("kernel_mul_mv_iq4_xs_f32")]]
+kernel void kernel_mul_mv_iq4_xs_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        threadgroup int8_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint tiisg[[thread_index_in_simdgroup]],
+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+#if QK_K == 64
+    kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+#else
+    kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+#endif
+}
+
+//============================= templates and their specializations =============================
+
+// NOTE: this is not dequantizing - we are simply fitting the template
+template <typename type4x4>
+void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
+    float4x4 temp = *(((device float4x4 *)src));
+    for (int i = 0; i < 16; i++){
+        reg[i/4][i%4] = temp[i/4][i%4];
+    }
+}
+
+template <typename type4x4>
+void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
+    half4x4 temp = *(((device half4x4 *)src));
+    for (int i = 0; i < 16; i++){
+        reg[i/4][i%4] = temp[i/4][i%4];
+    }
+}
+
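+// Each dequantize_* template below expands one 16-weight tile of a block into
+// a 4x4 register matrix for the templated matrix kernels; `il` selects the
+// tile. For q4_0 the decode w = d*(q - 8) is folded into d1/d2 and md so both
+// nibble positions of each uint16 are handled without shifts.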
+template <typename type4x4>
+void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
+    const float d1 = il ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float md = -8.h * xb->d;
+    const ushort mask0 = il ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    for (int i = 0; i < 8; i++) {
+        reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
+        reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
+    const float d1 = il ? (xb->d / 16.h) : xb->d;
+    const float d2 = d1 / 256.f;
+    const float  m = xb->m;
+    const ushort mask0 = il ? 0x00F0 : 0x000F;
+    const ushort mask1 = mask0 << 8;
+
+    for (int i = 0; i < 8; i++) {
+        reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m;
+        reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_0(device const block_q5_0 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 3);
+    const float d = xb->d;
+    const float md = -16.h * xb->d;
+    const ushort mask = il ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = il ? 4 : 0;
+
+    const int gh_mv = il ? 12 : 0;
+    const int gh_bk = il ?  0 : 4;
+
+    for (int i = 0; i < 8; i++) {
+        // extract the 5th bit for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4 bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[i/2][2*(i%2)+0] = d * x0 + md;
+        reg[i/2][2*(i%2)+1] = d * x1 + md;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_1(device const block_q5_1 *xb, short il, thread type4x4 & reg) {
+    device const uint16_t * qs = ((device const uint16_t *)xb + 4);
+    const float d = xb->d;
+    const float m = xb->m;
+    const ushort mask = il ? 0x00F0 : 0x000F;
+
+    const uint32_t qh = *((device const uint32_t *)xb->qh);
+
+    const int x_mv = il ? 4 : 0;
+
+    const int gh_mv = il ? 12 : 0;
+    const int gh_bk = il ?  0 : 4;
+
+    for (int i = 0; i < 8; i++) {
+        // extract the 5th bit for x0 and x1
+        const uint8_t xh_0 = ((qh >> (gh_mv + 2*i  )) << gh_bk) & 0x10;
+        const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
+
+        // combine the 4 bits from qs with the 5th bit
+        const int32_t x0 = ((((qs[i]     ) & mask) >> x_mv) | xh_0);
+        const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
+
+        reg[i/2][2*(i%2)+0] = d * x0 + m;
+        reg[i/2][2*(i%2)+1] = d * x1 + m;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
+    device const int8_t * qs = ((device const int8_t *)xb->qs);
+    const half d = xb->d;
+
+    for (int i = 0; i < 16; i++) {
+        reg[i/4][i%4] = (qs[i + 16*il] * d);
+    }
+}
+
+template <typename type4x4>
+void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
+    const float d = xb->d;
+    const float min = xb->dmin;
+    device const uint8_t * q = (device const uint8_t *)xb->qs;
+    float dl, ml;
+    uint8_t sc = xb->scales[il];
+
+#if QK_K == 256
+    q = q + 32*(il/8) + 16*(il&1);
+    il = (il/2)%4;
+#endif
+    half  coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
+    uchar mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
+    dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
+    const half d_all = xb->d;
+    device const uint8_t * q = (device const uint8_t *)xb->qs;
+    device const uint8_t * h = (device const uint8_t *)xb->hmask;
+    device const int8_t * scales = (device const int8_t *)xb->scales;
+
+#if QK_K == 256
+    q = q + 32 * (il/8) + 16 * (il&1);
+    h = h + 16 * (il&1);
+    uint8_t m = 1 << (il/2);
+    uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
+                                 ((il/4)>0 ? 12  : 3);
+    uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
+    uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
+    int16_t  dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
+                               : (scale_2&kmask2) | ((scale_1&kmask1) << 4);
+    float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
+    const float ml = 4.f * dl;
+
+    il = (il/2) & 3;
+    const half    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
+    const uint8_t mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
+    dl *= coef;
+
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
+    }
+#else
+    float    kcoef = il&1 ? 1.f/16.f : 1.f;
+    uint16_t kmask = il&1 ? 0xF0     : 0x0F;
+    float    dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
+    float    coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
+    uint8_t  mask = il>1 ? (il>2 ? 192    : 48)     : (il>0 ? 12    : 3);
+    uint8_t  m = 1<<(il*2);
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
+    }
+#endif
+}
+
+static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
+    return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
+                 : uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
+}
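+
+// layout note: q4_K/q5_K pack eight 6-bit (scale, min) pairs into 12 bytes;
+// for j < 4 the low 6 bits of q[j] and q[j+4] hold the pair directly, while
+// for j >= 4 the nibbles of q[j+4] supply 4 bits each and the top two bits
+// of q[j-4] / q[j] supply the rest (hence the & 0xc0 and >> 2 above).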
+
+template <typename type4x4>
+void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
+    device const uchar * q = xb->qs;
+
+#if QK_K == 256
+    short is = (il/4) * 2;
+    q = q + (il/4) * 32 + 16 * (il&1);
+    il = il & 3;
+    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
+    const float d   = il < 2 ? xb->d : xb->d / 16.h;
+    const float min = xb->dmin;
+    const float dl = d * sc[0];
+    const float ml = min * sc[1];
+#else
+    (void) get_scale_min_k4_just2;
+
+    q = q + 16 * (il&1);
+    device const uint8_t * s = xb->scales;
+    device const half2 * dh = (device const half2 *)xb->d;
+    const float2 d = (float2)dh[0];
+    const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
+    const float ml = il<2 ? d[1] * (s[0]>>4)  : d[1] * (s[1]>>4);
+#endif
+    const ushort mask = il<2 ? 0x0F : 0xF0;
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * (q[i] & mask) - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) {
+    device const uint8_t * q  = xb->qs;
+    device const uint8_t * qh = xb->qh;
+
+#if QK_K == 256
+    short is = (il/4) * 2;
+    q  = q + 32 * (il/4) + 16 * (il&1);
+    qh = qh + 16 * (il&1);
+    uint8_t ul = 1 << (il/2);
+    il = il & 3;
+    const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
+    const float d = il < 2 ? xb->d : xb->d / 16.f;
+    const float min = xb->dmin;
+    const float dl = d * sc[0];
+    const float ml = min * sc[1];
+
+    const ushort mask  = il<2 ? 0x0F : 0xF0;
+    const float qh_val = il<2 ? 16.f : 256.f;
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
+    }
+#else
+    q = q + 16 * (il&1);
+    device const int8_t * s = xb->scales;
+    const float dl = xb->d * s[il];
+    uint8_t m = 1<<(il*2);
+    const float  coef = il<2 ? 1.f  : 1.f/16.f;
+    const ushort mask = il<2 ? 0x0F : 0xF0;
+    for (int i = 0; i < 16; ++i) {
+        reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
+    }
+#endif
+}
+
+template <typename type4x4>
+void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
+    const half d_all = xb->d;
+    device const uint8_t * ql = (device const uint8_t *)xb->ql;
+    device const uint8_t * qh = (device const uint8_t *)xb->qh;
+    device const int8_t * scales = (device const int8_t *)xb->scales;
+
+#if QK_K == 256
+    ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
+    qh = qh + 32*(il/8) + 16*(il&1);
+    float sc = scales[(il%2) + 2 * ((il/2))];
+    il = (il/2) & 3;
+#else
+    ql = ql + 16 * (il&1);
+    float sc = scales[il];
+#endif
+    const uint16_t  kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
+    const uint16_t  kmask2 = il>1 ? 0xF0              : 0x0F;
+    const float       coef = il>1 ? 1.f/16.f          : 1.f;
+    const float ml = d_all * sc * 32.f;
+    const float dl = d_all * sc * coef;
+    for (int i = 0; i < 16; ++i) {
+        const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2))
+                            : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4));
+        reg[i/4][i%4] = dl * q - ml;
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
+    device const uint16_t * q2 = xb->qs + 4*ib32;
+    const uint32_t aux32_g = q2[0] | (q2[1] << 16);
+    const uint32_t aux32_s = q2[2] | (q2[3] << 16);
+    thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
+    const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
+    constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
+    uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+    grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
+    signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
+    for (int i = 0; i < 8; ++i) {
+        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+}
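+
+// layout note: per 32-quant sub-block, q2[0..1] pack four 8-bit indices into
+// the iq2xxs_grid codebook and q2[2..3] pack four 7-bit sign-group indices
+// plus a 4-bit scale in the top bits (aux32_s >> 28), so the effective
+// sub-block scale is dl = d * (0.5 + scale) * 0.25.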
+
+template <typename type4x4>
+void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint16_t * q2 = xb->qs + 4*ib32;
+    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
+    constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
+    uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+    grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
+    signs = ksigns_iq2xs[q2[2*il+1] >> 9];
+    for (int i = 0; i < 8; ++i) {
+        reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * q3 = xb->qs + 8*ib32;
+    device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
+    const uint32_t aux32 = gas[0] | (gas[1] << 16);
+    const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
+    constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
+    uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
+        reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
+    }
+    grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
+    grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
+    signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
+    for (int i = 0; i < 4; ++i) {
+        reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
+        reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * qs = xb->qs + 8*ib32;
+    device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
+    const uint8_t qh = xb->qh[ib32] >> 4*il;
+    const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
+    constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
+        reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
+    }
+    grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256)));
+    grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256)));
+    for (int i = 0; i < 4; ++i) {
+        reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
+        reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint8_t * signs = qs + QK_K/8;
+    const uint8_t qh = xb->qh[ib32] >> 4*il;
+    const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
+    for (int i = 0; i < 8; ++i) {
+        reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
+        reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const int ib32 = il/2;
+    il = il%2;
+    const float d = xb->d;
+    device const uint8_t  * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint16_t * qh = xb->qh;
+    const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1);
+    const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA);
+    const uint16_t h = qh[ib32] >> 6*il;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * (grid1[i] & 0xf) + ml;
+        reg[1][i] = dl * (grid1[i] >>  4) + ml;
+        reg[2][i] = dl * (grid2[i] & 0xf) + ml;
+        reg[3][i] = dl * (grid2[i] >>  4) + ml;
+    }
+}
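+
+// note: the iq1_s grid index is 11 bits (8 from qs plus 3 from qh), and the
+// offset ml = dl * (-1 +/- IQ1S_DELTA) uses the 0x8000 bit of qh[ib32] to
+// pick between the two shifted decoding lattices.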
+
+template <typename type4x4>
+void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const int ib32 = il/2;
+    il = il%2;
+    device const uint16_t * sc = (device const uint16_t *)xb->scales;
+#if QK_K == 64
+    const float d = xb->d;
+#else
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const float d = scale.f16;
+#endif
+    device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
+    device const uint8_t * qh = xb->qh + 2*ib32 + il;
+#if QK_K == 64
+    const float dl  = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1);
+#else
+    const float dl  = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
+#endif
+    const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+    const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
+    constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
+    constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * (grid1[i] & 0xf) + ml1;
+        reg[1][i] = dl * (grid1[i] >>  4) + ml1;
+        reg[2][i] = dl * (grid2[i] & 0xf) + ml2;
+        reg[3][i] = dl * (grid2[i] >>  4) + ml2;
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
+    device const uint16_t * q4 = (device const uint16_t *)xb->qs;
+    const float d = xb->d;
+    uint32_t aux32;
+    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
+    for (int i = 0; i < 4; ++i) {
+        aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
+        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
+        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
+        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
+        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
+    }
+}
+
+template <typename type4x4>
+void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
+#if QK_K == 64
+    dequantize_iq4_nl(xb, il, reg);
+#else
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;
+    const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4);
+    const float d = (float)xb->d * (ls - 32);
+    uint32_t aux32;
+    thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
+    for (int i = 0; i < 4; ++i) {
+        aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f;
+        reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
+        reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
+        reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
+        reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
+    }
+#endif
+}
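+
+// note: iq4_xs reuses the 16-entry nonlinear table kvalues_iq4nl_f from
+// iq4_nl, adding a 6-bit scale per 32 quants (4 bits from scales_l, 2 from
+// scales_h) that is biased by 32, so the effective scale is d * (ls - 32).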
+
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
+kernel void kernel_get_rows(
+        device const  void * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        uint3                tgpig[[threadgroup_position_in_grid]],
+        uint                 tiitg[[thread_index_in_threadgroup]],
+        uint3                tptg [[threads_per_threadgroup]]) {
+    //const int64_t i = tgpig;
+    //const int64_t r = ((device int32_t *) src1)[i];
+
+    const int64_t i10 = tgpig.x;
+    const int64_t i11 = tgpig.y;
+
+    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    const int64_t i02 = i11;
+
+    for (int64_t ind = tiitg; ind < ne00/16; ind += tptg.x) {
+        float4x4 temp;
+        dequantize_func(
+            ((device const block_q *) ((device char *) src0 + r*nb01 + i02*nb02)) + ind/nl, ind%nl, temp);
+        *(((device float4x4 *) ((device char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
+    }
+}
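+
+// dispatch note (assumption): the host launches one threadgroup per (row,
+// batch) pair, so tgpig.x selects the row id read from src1 and tgpig.y the
+// batch; the threads then stride over the ne00/16 dequantized 4x4 tiles.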
+
+kernel void kernel_get_rows_f32(
+        device const  void * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        uint3                tgpig[[threadgroup_position_in_grid]],
+        uint                 tiitg[[thread_index_in_threadgroup]],
+        uint3                tptg [[threads_per_threadgroup]]) {
+    const int64_t i10 = tgpig.x;
+    const int64_t i11 = tgpig.y;
+
+    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    const int64_t i02 = i11;
+
+    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
+        ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
+            ((device float *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
+    }
+}
+
+kernel void kernel_get_rows_f16(
+        device const  void * src0,
+        device const  char * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        uint3                tgpig[[threadgroup_position_in_grid]],
+        uint                 tiitg[[thread_index_in_threadgroup]],
+        uint3                tptg [[threads_per_threadgroup]]) {
+    const int64_t i10 = tgpig.x;
+    const int64_t i11 = tgpig.y;
+
+    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    const int64_t i02 = i11;
+
+    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
+        ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
+            ((device half *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
+    }
+}
+
+kernel void kernel_get_rows_i32(
+        device const  void * src0,
+        device const  char * src1,
+        device     int32_t * dst,
+        constant   int64_t & ne00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb1,
+        constant  uint64_t & nb2,
+        uint3                tgpig[[threadgroup_position_in_grid]],
+        uint                 tiitg[[thread_index_in_threadgroup]],
+        uint3                tptg [[threads_per_threadgroup]]) {
+    const int64_t i10 = tgpig.x;
+    const int64_t i11 = tgpig.y;
+
+    const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0];
+
+    const int64_t i02 = i11;
+
+    for (int ind = tiitg; ind < ne00; ind += tptg.x) {
+        ((device int32_t *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] =
+            ((device int32_t *) ((device char *) src0 + r*nb01 + i02*nb02))[ind];
+    }
+}
+
+
+#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
+#define BLOCK_SIZE_K 32
+#define THREAD_MAT_M 4 // each thread takes 4 simdgroup matrices from matrix A
+#define THREAD_MAT_N 2 // each thread takes 2 simdgroup matrices from matrix B
+#define THREAD_PER_BLOCK 128
+#define THREAD_PER_ROW 2 // 2 threads for each row in matrix A to load numbers
+#define THREAD_PER_COL 4 // 4 threads for each row in matrix B to load numbers
+#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8
+#define SG_MAT_ROW 8
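+
+// shared-memory budget implied by these sizes (sketch): the A tile holds
+// BLOCK_SIZE_M x BLOCK_SIZE_K halves = 64*32*2 = 4096 bytes and the B tile
+// BLOCK_SIZE_N x BLOCK_SIZE_K floats = 32*32*4 = 4096 bytes, which is why sb
+// starts at shared_memory + 4096 below and the indirect kernels place their
+// row ids at shared_memory + 8192.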
+
+// each block_q contains 16*nl weights
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
+void kernel_mul_mm_impl(device const  uchar * src0,
+                        device const  uchar * src1,
+                        device        float * dst,
+                        constant    int64_t & ne00,
+                        constant    int64_t & ne02,
+                        constant   uint64_t & nb01,
+                        constant   uint64_t & nb02,
+                        constant    int64_t & ne12,
+                        constant   uint64_t & nb10,
+                        constant   uint64_t & nb11,
+                        constant   uint64_t & nb12,
+                        constant    int64_t & ne0,
+                        constant    int64_t & ne1,
+                        constant       uint & r2,
+                        constant       uint & r3,
+                        threadgroup   uchar * shared_memory [[threadgroup(0)]],
+                        uint3                 tgpig[[threadgroup_position_in_grid]],
+                        uint                  tiitg[[thread_index_in_threadgroup]],
+                        uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    threadgroup half  * sa = (threadgroup half  *)(shared_memory);
+    threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
+
+    const uint r0 = tgpig.y;
+    const uint r1 = tgpig.x;
+    const uint im = tgpig.z;
+
+    // clamp the tile extents in case this block is smaller than 64x32 (at the matrix edge)
+    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
+    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+
+    // a thread shouldn't load data outside of the matrix
+    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
+    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+
+    simdgroup_half8x8  ma[4];
+    simdgroup_float8x8 mb[2];
+    simdgroup_float8x8 c_res[8];
+    for (int i = 0; i < 8; i++){
+        c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+    }
+
+    short il = (tiitg % THREAD_PER_ROW);
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    uint   offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02);
+    ushort offset1 = il/nl;
+
+    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
+    device const float   * y = (device const float   *)(src1
+        + nb12 * im
+        + nb11 * (r1 * BLOCK_SIZE_N + thread_col)
+        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
+
+    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
+        // load data and store to threadgroup memory
+        half4x4 temp_a;
+        dequantize_func(x, il, temp_a);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        #pragma unroll(16)
+        for (int i = 0; i < 16; i++) {
+            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
+            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
+            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
+        }
+
+        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+
+        il = (il + 2 < nl) ? il + 2 : il % 2;
+        x  = (il < 2) ? x + (2+nl-1)/nl : x;
+        y += BLOCK_SIZE_K;
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // load matrices from threadgroup memory and conduct outer products
+        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
+        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+
+        #pragma unroll(4)
+        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
+            #pragma unroll(4)
+            for (int i = 0; i < 4; i++) {
+                simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i);
+            }
+            simdgroup_barrier(mem_flags::mem_none);
+            #pragma unroll(2)
+            for (int i = 0; i < 2; i++) {
+                simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i);
+            }
+
+            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
+            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+
+            #pragma unroll(8)
+            for (int i = 0; i < 8; i++){
+                simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
+            }
+        }
+    }
+
+    if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
+        device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) \
+                               + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
+        for (int i = 0; i < 8; i++) {
+            simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
+        }
+    } else {
+        // the block is smaller than 64x32, so avoid writing outside the matrix
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
+                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
+        for (int i = 0; i < 8; i++) {
+            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+        if (sgitg == 0) {
+            for (int i = 0; i < n_rows; i++) {
+                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
+                    *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
+                }
+            }
+        }
+    }
+}
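+
+// epilogue note: full 64x32 tiles are stored straight to device memory with
+// simdgroup_store, while edge tiles are first staged through threadgroup
+// memory so that only the valid n_rows x n_cols region is written out.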
+
+// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
+void kernel_mul_mm_id_impl(
+        device const  uchar * src0,
+        device const  uchar * src1,
+        threadgroup ushort2 * rowids,
+        device        float * dst,
+        constant    int64_t & ne00,
+        constant    int64_t & ne02,
+        constant   uint64_t & nb01,
+        constant   uint64_t & nb02,
+        constant    int64_t & ne11,
+        constant    int64_t & ne12,
+        constant   uint64_t & nb10,
+        constant   uint64_t & nb11,
+        constant   uint64_t & nb12,
+        constant    int64_t & ne0,
+                    int64_t   ne1,
+                    int64_t   ne0ne1,
+        threadgroup   uchar * shared_memory,
+        uint3                 tgpig[[threadgroup_position_in_grid]],
+        uint                  tiitg[[thread_index_in_threadgroup]],
+        uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    threadgroup half  * sa = (threadgroup half  *)(shared_memory);
+    threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
+
+    const uint r0 = tgpig.y;
+    const uint r1 = tgpig.x;
+
+    if (r1 * BLOCK_SIZE_N >= ne1) return;
+
+    // clamp the tile extents in case this block is smaller than 64x32 (at the matrix edge)
+    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
+    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+
+    // a thread shouldn't load data outside of the matrix
+    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
+    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+
+    simdgroup_half8x8  ma[4];
+    simdgroup_float8x8 mb[2];
+    simdgroup_float8x8 c_res[8];
+    for (int i = 0; i < 8; i++){
+        c_res[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
+    }
+    short il = (tiitg % THREAD_PER_ROW);
+
+    ushort offset1 = il/nl;
+
+    threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col];
+
+    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1;
+    device const float   * y = (device const float   *)(src1
+        + nb12 * id[1]
+        + nb11 * (id[0] % ne11)
+        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
+
+    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
+        // load data and store to threadgroup memory
+        half4x4 temp_a;
+        dequantize_func(x, il, temp_a);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        for (int i = 0; i < 16; i++) {
+            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
+            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
+            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
+        }
+
+        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+
+        il = (il + 2 < nl) ? il + 2 : il % 2;
+        x  = (il < 2) ? x + (2+nl-1)/nl : x;
+        y += BLOCK_SIZE_K;
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        // load matrices from threadgroup memory and conduct outer products
+        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
+        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+
+        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
+            for (int i = 0; i < 4; i++) {
+                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
+            }
+            simdgroup_barrier(mem_flags::mem_none);
+            for (int i = 0; i < 2; i++) {
+                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
+            }
+
+            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
+            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
+
+            for (int i = 0; i < 8; i++){
+                simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
+            }
+        }
+    }
+
+    {
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
+                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
+        for (int i = 0; i < 8; i++) {
+            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        device float * C = dst + (BLOCK_SIZE_M * r0);
+        if (sgitg == 0) {
+            for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
+                threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j];
+                int joff =  jid[0] * ne0 + jid[1] * ne0ne1;
+                for (int i = 0; i < n_rows; i++) {
+                    *(C + i + joff) = *(temp_str + i + j * BLOCK_SIZE_M);
+                }
+            }
+        }
+    }
+}
+
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
+kernel void kernel_mul_mm(device const  uchar * src0,
+                          device const  uchar * src1,
+                          device        float * dst,
+                          constant    int64_t & ne00,
+                          constant    int64_t & ne02,
+                          constant   uint64_t & nb01,
+                          constant   uint64_t & nb02,
+                          constant    int64_t & ne12,
+                          constant   uint64_t & nb10,
+                          constant   uint64_t & nb11,
+                          constant   uint64_t & nb12,
+                          constant    int64_t & ne0,
+                          constant    int64_t & ne1,
+                          constant       uint & r2,
+                          constant       uint & r3,
+                          threadgroup   uchar * shared_memory [[threadgroup(0)]],
+                          uint3                 tgpig[[threadgroup_position_in_grid]],
+                          uint                  tiitg[[thread_index_in_threadgroup]],
+                          uint                  sgitg[[simdgroup_index_in_threadgroup]]) {
+    kernel_mul_mm_impl<block_q, nl, dequantize_func>(
+        src0,
+        src1,
+        dst,
+        ne00,
+        ne02,
+        nb01,
+        nb02,
+        ne12,
+        nb10,
+        nb11,
+        nb12,
+        ne0,
+        ne1,
+        r2,
+        r3,
+        shared_memory,
+        tgpig,
+        tiitg,
+        sgitg);
+}
+
+template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
+kernel void kernel_mul_mm_id(
+        device const   uchar * src0s,
+        device const   uchar * src1,
+        device         float * dst,
+        device const   uchar * ids,
+        constant     int64_t & nei0,
+        constant     int64_t & nei1,
+        constant    uint64_t & nbi1,
+        constant     int64_t & ne00,
+        constant     int64_t & ne02,
+        constant    uint64_t & nb01,
+        constant    uint64_t & nb02,
+        constant     int64_t & ne11,
+        constant     int64_t & ne12,
+        constant     int64_t & ne13,
+        constant    uint64_t & nb10,
+        constant    uint64_t & nb11,
+        constant    uint64_t & nb12,
+        constant     int64_t & ne0,
+        constant     int64_t & ne1,
+        constant    uint64_t & nb1,
+        threadgroup    uchar * shared_memory [[threadgroup(0)]],
+        uint3                  tgpig[[threadgroup_position_in_grid]],
+        uint                   tiitg[[thread_index_in_threadgroup]],
+        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    const int32_t i02 = tgpig.z;
+    tgpig.z = 0;
+
+    device const uchar * src0 = src0s + i02*nb02;
+
+    // row indices
+    threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shared_memory + 8192);
+
+    // TODO: parallelize this loop
+    int64_t _ne1 = 0;
+    for (ushort ii1 = 0; ii1 < nei1; ii1++) {
+        for (ushort ii0 = 0; ii0 < nei0; ii0++) {
+            int32_t id = ((device int32_t *) (ids + ii1*nbi1))[ii0];
+            if (id == i02) {
+                //if (tiitg == 0) {
+                    rowids[_ne1] = ushort2(ii0, ii1);
+                //}
+                _ne1++;
+            }
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    kernel_mul_mm_id_impl<block_q, nl, dequantize_func>(
+        src0,
+        src1,
+        rowids,
+        dst,
+        ne00,
+        ne02,
+        nb01,
+        nb02,
+        ne11,
+        ne12,
+        nb10,
+        nb11,
+        nb12,
+        ne0,
+        _ne1,
+        ne0*ne1,
+        shared_memory,
+        tgpig,
+        tiitg,
+        sgitg);
+}
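+
+// worked example for the rowids gather above (illustrative values): with
+// nei0 = 2 expert slots per token and ids = [[0, 2], [1, 0]] for two tokens,
+// the threadgroups for expert i02 = 0 collect rowids = {(0,0), (1,1)} and
+// _ne1 = 2, so only the rows routed to that expert are multiplied.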
+
+#if QK_K == 256
+#define QK_NL 16
+#else
+#define QK_NL 4
+#endif
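+
+// nl is the number of 16-weight chunks per quant block: k-quant blocks hold
+// QK_K weights, giving QK_NL = 256/16 = 16 (or 64/16 = 4), while q4_0-style
+// blocks hold 32 weights and are instantiated below with nl = 2.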
+
+//
+// get rows
+//
+
+typedef void (get_rows_t)(
+        device const void * src0,
+        device const char * src1,
+        device      float * dst,
+        constant  int64_t & ne00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant  int64_t & ne10,
+        constant uint64_t & nb10,
+        constant uint64_t & nb11,
+        constant uint64_t & nb1,
+        constant uint64_t & nb2,
+        uint3, uint, uint3);
+
+//template [[host_name("kernel_get_rows_f32")]]  kernel get_rows_t kernel_get_rows<float4x4,   1, dequantize_f32>;
+//template [[host_name("kernel_get_rows_f16")]]  kernel get_rows_t kernel_get_rows<half4x4,    1, dequantize_f16>;
+template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
+template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
+template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_t kernel_get_rows<block_q5_0, 2, dequantize_q5_0>;
+template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_t kernel_get_rows<block_q5_1, 2, dequantize_q5_1>;
+template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_t kernel_get_rows<block_q8_0, 2, dequantize_q8_0>;
+template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows<block_q2_K, QK_NL, dequantize_q2_K>;
+template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows<block_q3_K, QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows<block_q4_K, QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
+template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
+template [[host_name("kernel_get_rows_iq2_xs")]]  kernel get_rows_t kernel_get_rows<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_get_rows_iq3_s")]]   kernel get_rows_t kernel_get_rows<block_iq3_s,   QK_NL, dequantize_iq3_s>;
+template [[host_name("kernel_get_rows_iq2_s")]]   kernel get_rows_t kernel_get_rows<block_iq2_s,   QK_NL, dequantize_iq2_s>;
+template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_t kernel_get_rows<block_iq1_s,   QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_get_rows_iq1_m")]]   kernel get_rows_t kernel_get_rows<block_iq1_m,   QK_NL, dequantize_iq1_m>;
+template [[host_name("kernel_get_rows_iq4_nl")]]  kernel get_rows_t kernel_get_rows<block_iq4_nl,  2,     dequantize_iq4_nl>;
+#if QK_K == 64
+template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  2,     dequantize_iq4_xs>;
+#else
+template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_t kernel_get_rows<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+#endif
+
+//
+// matrix-matrix multiplication
+//
+
+typedef decltype(kernel_mul_mm<float4x4, 1, dequantize_f32>) mat_mm_t;
+
+template [[host_name("kernel_mul_mm_f32_f32")]]     kernel mat_mm_t kernel_mul_mm<float4x4,      1,     dequantize_f32>;
+template [[host_name("kernel_mul_mm_f16_f32")]]     kernel mat_mm_t kernel_mul_mm<half4x4,       1,     dequantize_f16>;
+template [[host_name("kernel_mul_mm_q4_0_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q4_0,    2,     dequantize_q4_0>;
+template [[host_name("kernel_mul_mm_q4_1_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q4_1,    2,     dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_q5_0_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q5_0,    2,     dequantize_q5_0>;
+template [[host_name("kernel_mul_mm_q5_1_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q5_1,    2,     dequantize_q5_1>;
+template [[host_name("kernel_mul_mm_q8_0_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q8_0,    2,     dequantize_q8_0>;
+template [[host_name("kernel_mul_mm_q2_K_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q2_K,    QK_NL, dequantize_q2_K>;
+template [[host_name("kernel_mul_mm_q3_K_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q3_K,    QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_mul_mm_q4_K_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q4_K,    QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_mul_mm_q5_K_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q5_K,    QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_mul_mm_q6_K_f32")]]    kernel mat_mm_t kernel_mul_mm<block_q6_K,    QK_NL, dequantize_q6_K>;
+template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
+template [[host_name("kernel_mul_mm_iq2_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_mul_mm_iq3_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq3_s,   QK_NL, dequantize_iq3_s>;
+template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq2_s,   QK_NL, dequantize_iq2_s>;
+template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_s,   QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_m,   QK_NL, dequantize_iq1_m>;
+template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_nl>;
+#if QK_K == 64
+template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_nl,  2,     dequantize_iq4_xs>;
+#else
+template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+#endif
+
+//
+// indirect matrix-matrix multiplication
+//
+
+typedef decltype(kernel_mul_mm_id<float4x4, 1, dequantize_f32>) mat_mm_id_t;
+
+template [[host_name("kernel_mul_mm_id_f32_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<float4x4,      1,     dequantize_f32>;
+template [[host_name("kernel_mul_mm_id_f16_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<half4x4,       1,     dequantize_f16>;
+template [[host_name("kernel_mul_mm_id_q4_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_0,    2,     dequantize_q4_0>;
+template [[host_name("kernel_mul_mm_id_q4_1_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_1,    2,     dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_id_q5_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_0,    2,     dequantize_q5_0>;
+template [[host_name("kernel_mul_mm_id_q5_1_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_1,    2,     dequantize_q5_1>;
+template [[host_name("kernel_mul_mm_id_q8_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q8_0,    2,     dequantize_q8_0>;
+template [[host_name("kernel_mul_mm_id_q2_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q2_K,    QK_NL, dequantize_q2_K>;
+template [[host_name("kernel_mul_mm_id_q3_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q3_K,    QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_mul_mm_id_q4_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_K,    QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_mul_mm_id_q5_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_K,    QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_mul_mm_id_q6_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K,    QK_NL, dequantize_q6_K>;
+template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
+template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_mul_mm_id_iq3_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s,   QK_NL, dequantize_iq3_s>;
+template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s,   QK_NL, dequantize_iq2_s>;
+template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_mul_mm_id_iq1_m_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m,   QK_NL, dequantize_iq1_m>;
+template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
+#if QK_K == 64
+template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  2,     dequantize_iq4_xs>;
+#else
+template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+#endif
+
+//
+// matrix-vector multiplication
+//
+
+typedef void (kernel_mul_mv_impl_t)(
+        device const  char * src0,
+        device const  char * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                  uint64_t   nb00,
+                  uint64_t   nb01,
+                  uint64_t   nb02,
+                   int64_t   ne10,
+                   int64_t   ne11,
+                   int64_t   ne12,
+                  uint64_t   nb10,
+                  uint64_t   nb11,
+                  uint64_t   nb12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+                   uint3     tgpig,
+                   uint      tiisg);
+
+typedef void (kernel_mul_mv2_impl_t)(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+                   int64_t   ne00,
+                   int64_t   ne01,
+                   int64_t   ne02,
+                   int64_t   ne10,
+                   int64_t   ne12,
+                   int64_t   ne0,
+                   int64_t   ne1,
+                   uint      r2,
+                   uint      r3,
+        threadgroup int8_t * shared_values,
+                   uint3     tgpig,
+                   uint      tiisg,
+                   uint      sgitg);
+
+template<kernel_mul_mv_impl_t impl_fn>
+void mmv_fn(
+        device const    char * src0,
+        device const    char * src1,
+        device         float * dst,
+                     int64_t   ne00,
+                     int64_t   ne01,
+                     int64_t   ne02,
+                    uint64_t   nb00,
+                    uint64_t   nb01,
+                    uint64_t   nb02,
+                     int64_t   ne10,
+                     int64_t   ne11,
+                     int64_t   ne12,
+                     int64_t   ne13,
+                    uint64_t   nb10,
+                    uint64_t   nb11,
+                    uint64_t   nb12,
+                     int64_t   ne0,
+                     int64_t   ne1,
+                    uint64_t   nb1,
+                        uint   r2,
+                        uint   r3,
+        threadgroup int8_t   * shared_values,
+        uint3                  tgpig,
+        uint                   tiitg,
+        uint                   tiisg,
+        uint                   sgitg) {
+    impl_fn(src0,src1,dst,ne00,ne01,ne02,nb00,nb01,nb02,ne10,ne11,ne12,nb10,nb11,nb12,ne0,ne1,r2,r3,tgpig,tiisg);
+}
+
+template<kernel_mul_mv2_impl_t impl_fn>
+void mmv_fn(
+        device const    char * src0,
+        device const    char * src1,
+        device         float * dst,
+                     int64_t   ne00,
+                     int64_t   ne01,
+                     int64_t   ne02,
+                    uint64_t   nb00,
+                    uint64_t   nb01,
+                    uint64_t   nb02,
+                     int64_t   ne10,
+                     int64_t   ne11,
+                     int64_t   ne12,
+                     int64_t   ne13,
+                    uint64_t   nb10,
+                    uint64_t   nb11,
+                    uint64_t   nb12,
+                     int64_t   ne0,
+                     int64_t   ne1,
+                    uint64_t   nb1,
+                        uint   r2,
+                        uint   r3,
+        threadgroup int8_t   * shared_values,
+        uint3                  tgpig,
+        uint                   tiitg,
+        uint                   tiisg,
+        uint                   sgitg) {
+    impl_fn(src0,(const device float *)src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,shared_values,tgpig,tiisg,sgitg);
+}
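+
+// the two mmv_fn overloads above only adapt the two mul-mv impl signatures
+// (with and without threadgroup shared values) to a single parameter list,
+// so that kernel_mul_mv_id below can be instantiated uniformly over both.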
+
+typedef decltype(mmv_fn<kernel_mul_mv_f32_f32_impl>) mul_mv_impl_fn_t;
+
+template<mul_mv_impl_fn_t impl_fn>
+kernel void kernel_mul_mv_id(
+        device const    char * src0s,
+        device const    char * src1,
+        device         float * dst,
+        device const    char * ids,
+        constant     int64_t & nei0,
+        constant     int64_t & nei1,
+        constant    uint64_t & nbi1,
+        constant     int64_t & ne00,
+        constant     int64_t & ne01,
+        constant     int64_t & ne02,
+        constant    uint64_t & nb00,
+        constant    uint64_t & nb01,
+        constant    uint64_t & nb02,
+        constant     int64_t & ne10,
+        constant     int64_t & ne11,
+        constant     int64_t & ne12,
+        constant     int64_t & ne13,
+        constant    uint64_t & nb10,
+        constant    uint64_t & nb11,
+        constant    uint64_t & nb12,
+        constant     int64_t & ne0,
+        constant     int64_t & ne1,
+        constant    uint64_t & nb1,
+        threadgroup int8_t   * shared_values [[threadgroup(0)]],
+        uint3                  tgpig[[threadgroup_position_in_grid]],
+        uint                   tiitg[[thread_index_in_threadgroup]],
+        uint                   tiisg[[thread_index_in_simdgroup]],
+        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
+    const int iid1 = tgpig.z/nei0;
+    const int idx = tgpig.z%nei0;
+
+    tgpig.z = 0;
+
+    const int32_t i02 = ((device const int32_t *) (ids + iid1*nbi1))[idx];
+
+    const int64_t i11 = idx % ne11;
+    const int64_t i12 = iid1;
+
+    const int64_t i1 = idx;
+    const int64_t i2 = i12;
+
+    device const char * src0_cur = src0s + i02*nb02;
+    device const char * src1_cur = src1 + i11*nb11 + i12*nb12;
+    device      float * dst_cur  = dst + i1*ne0 + i2*ne1*ne0;
+
+    impl_fn(
+        /* src0 */ src0_cur,
+        /* src1 */ src1_cur,
+        /* dst  */ dst_cur,
+        /* ne00 */ ne00,
+        /* ne01 */ ne01,
+        /* ne02 */ 1,//ne02,
+        /* nb00 */ nb00,
+        /* nb01 */ nb01,
+        /* nb02 */ nb02,
+        /* ne10 */ ne10,
+        /* ne11 */ 1,//ne11,
+        /* ne12 */ 1,//ne12,
+        /* ne13 */ 1,//ne13,
+        /* nb10 */ nb10,
+        /* nb11 */ nb11,
+        /* nb12 */ nb12,
+        /* ne0  */ ne0,
+        /* ne1  */ 1,//ne1,
+        /* nb1  */ nb1,
+        /* r2   */ 1,
+        /* r3   */ 1,
+        shared_values,
+        tgpig,
+        tiitg,
+        tiisg,
+        sgitg);
+}
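+
+// indexing note: each z-slice of the grid handles one (expert slot, token)
+// pair: iid1 = tgpig.z / nei0 is the token and idx = tgpig.z % nei0 the
+// slot; the id matrix selects that pair's src0 expert, tgpig.z is zeroed,
+// and the inner impl writes the dst row at i1 = idx, i2 = iid1.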
+
+typedef decltype(kernel_mul_mv_id<mmv_fn<kernel_mul_mv_f32_f32_impl>>) kernel_mul_mv_id_t;
+
+template [[host_name("kernel_mul_mv_id_f32_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_f32_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_f16_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_f16_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q8_0_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
+template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
+template [[host_name("kernel_mul_mv_id_q5_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
+template [[host_name("kernel_mul_mv_id_q5_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<mul_vec_q_n_f32_impl<block_q5_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>>>;
+template [[host_name("kernel_mul_mv_id_q2_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q2_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q3_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q3_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q4_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q4_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q5_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q5_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_q6_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_q6_K_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq1_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq1_s_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq1_m_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq1_m_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_xxs_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq2_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_xs_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_xxs_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq3_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]   kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
+template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
+#if QK_K != 64
+template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
+#endif
+

BIN
llama/ggml-metal.o


+ 133 - 133
llama/ggml-quants.h

@@ -1,133 +1,133 @@
-#pragma once
-
-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-
-#include "ggml.h"
-
-// GGML internal header
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Quantization
-void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-// Dequantization
-void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-
-void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-
-void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq1_m  (const block_iq1_m   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-
-// Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq1_m_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-
-size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-
-void iq2xs_init_impl(enum ggml_type type);
-void iq2xs_free_impl(enum ggml_type type);
-void iq3xs_init_impl(int grid_size);
-void iq3xs_free_impl(int grid_size);
-
-#ifdef __cplusplus
-}
-#endif
-
+#pragma once
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+
+#include "ggml.h"
+
+// GGML internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Quantization
+void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+// Dequantization
+void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
+void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
+void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq1_m  (const block_iq1_m   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+
+// Dot product
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq1_m_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+void iq2xs_init_impl(enum ggml_type type);
+void iq2xs_free_impl(enum ggml_type type);
+void iq3xs_init_impl(int grid_size);
+void iq3xs_free_impl(int grid_size);
+
+#ifdef __cplusplus
+}
+#endif
+
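The header above is plain C behind an `extern "C"` guard, so its row-quantization entry points are reachable from Go through cgo. A minimal sketch, assuming the ggml headers are on the include path; the wrapper name and the sizing discipline are illustrative, not part of this commit:

```go
package quant

// #cgo CFLAGS: -I.
// #include "ggml-quants.h"
import "C"

import "unsafe"

// quantizeRowQ80 quantizes len(x) floats into out using the q8_0 format.
// Sizing out correctly (one block_q8_0 per QK8_0 input floats) is the
// caller's responsibility in this sketch.
func quantizeRowQ80(x []float32, out []byte) {
	C.quantize_row_q8_0(
		(*C.float)(unsafe.Pointer(&x[0])),
		unsafe.Pointer(&out[0]),
		C.int64_t(len(x)),
	)
}
```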

+ 22 - 3
llama/llama.go

@@ -3,14 +3,13 @@ package llama
 // #cgo darwin,arm64 CFLAGS: -std=c11 -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 // #cgo darwin,arm64 CXXFLAGS: -std=c++11 -DGGML_USE_METAL -DGGML_METAL_EMBED_LIBRARY -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 // #cgo darwin,amd64 CXXFLAGS: -std=c++11
-// #cgo darwin,arm64 LDFLAGS: ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
-// #cgo darwin,amd64 LDFLAGS: -framework Foundation -framework Accelerate
+// #cgo darwin,arm64 LDFLAGS: -ld_classic ${SRCDIR}/ggml-metal.o -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
+// #cgo darwin,amd64 LDFLAGS: -ld_classic -framework Foundation -framework Accelerate
 // #cgo windows LDFLAGS: -lmsvcrt
 // #cgo avx CFLAGS: -mavx
 // #cgo avx CXXFLAGS: -mavx
 // #cgo avx2 CFLAGS: -mavx -mavx2 -mfma
 // #cgo avx2 CXXFLAGS: -mavx -mavx2 -mfma
-// #cgo avx2 LDFLAGS: -lm
 // #cgo cuda CFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo cuda CXXFLAGS: -std=c++11 -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CXXFLAGS: -std=c++11 -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_MULTIPLATFORM -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
@@ -24,6 +23,8 @@ import (
 	"runtime"
 	"strings"
 	"unsafe"
+
+	"github.com/ollama/ollama/llm"
 )
 
 type Token int32
@@ -201,3 +202,21 @@ func (m *Model) Tokenize(text string, maxTokens int, addSpecial bool, parseSpeci
 
 	return tokens, nil
 }
+
+func Quantize(infile, outfile string, ftype llm.FileType) error {
+	cinfile := C.CString(infile)
+	defer C.free(unsafe.Pointer(cinfile))
+
+	coutfile := C.CString(outfile)
+	defer C.free(unsafe.Pointer(coutfile))
+
+	params := C.llama_model_quantize_default_params()
+	params.nthread = -1
+	params.ftype = ftype.Value()
+
+	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
+		return fmt.Errorf("llama_model_quantize: %d", rc)
+	}
+
+	return nil
+}
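Quantize now lives in the llama package and takes the exported llm.FileType (see llm/filetype.go below). A usage sketch — the file paths are placeholders, and it assumes ParseFileType accepts the "Q4_0" spelling:

```go
package main

import (
	"log"

	"github.com/ollama/ollama/llama"
	"github.com/ollama/ollama/llm"
)

func main() {
	// Placeholder paths; error handling kept minimal for brevity.
	want, err := llm.ParseFileType("Q4_0")
	if err != nil {
		log.Fatal(err)
	}
	if err := llama.Quantize("model-f16.gguf", "model-q4_0.gguf", want); err != nil {
		log.Fatal(err)
	}
}
```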

+ 0 - 11
llama/metal.sh

@@ -1,11 +0,0 @@
-sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
-TEMP_ASSEMBLY=$(mktemp)
-echo ".section __DATA, __ggml_metallib"   >  $TEMP_ASSEMBLY
-echo ".globl _ggml_metallib_start"        >> $TEMP_ASSEMBLY
-echo "_ggml_metallib_start:"              >> $TEMP_ASSEMBLY
-echo ".incbin \"ggml-metal-embed.metal\"" >> $TEMP_ASSEMBLY
-echo ".globl _ggml_metallib_end"          >> $TEMP_ASSEMBLY
-echo "_ggml_metallib_end:"                >> $TEMP_ASSEMBLY
-as -mmacosx-version-min=11.3 $TEMP_ASSEMBLY -o ggml-metal.o
-rm -f $TEMP_ASSEMBLY
-rm -rf ggml-metal-embed.metal

+ 1 - 1
llama/runner/README.md

@@ -5,5 +5,5 @@
 ```
 
 ```
-curl POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/
+curl -X POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/
 ```

+ 5 - 36
llama/runner/main.go → llama/runner/runner.go

@@ -23,29 +23,9 @@ type Response struct {
 type Server struct {
 	model *llama.Model
 	lc    *llama.Context
-	batch *llama.Batch
-
-	queue chan Sequence
-	seqs  []*Sequence
-
-	// mu guards seqs
-	mu sync.Mutex
-}
-
-type Sequence struct {
-	prompt []llama.Token
-	out    chan string
 }
 
-func schedule(parallel int, queue <-chan Sequence) {
-	// Fill sequences from the queue
-
-	// once a sequence finishes, remove it from and add a new one from the queue
-}
-
-func process() {
-	// loop through the sequences, fill a batch, decode and sample tokens, responding to appropriate requests
-}
+var mu sync.Mutex
 
 func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
 	var request Request
@@ -59,23 +39,15 @@ func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Transfer-Encoding", "chunked")
 	w.WriteHeader(http.StatusOK)
 
+	enc := json.NewEncoder(w)
+
+	// main loop
 	tokens, err := s.model.Tokenize(request.Prompt, 2048, true, true)
 	if err != nil {
 		panic(err)
 	}
 
-	seq := Sequence{prompt: tokens}
-	s.queue <- seq
-
-	// listen for the sequence to finish
-	for {
-		str := <-seq.out
-		if err := json.NewEncoder(w).Encode(&Response{Token: str}); err != nil {
-			log.Println("Failed to encode result:", err)
-			return
-		}
-		w.(http.Flusher).Flush()
-	}
+	batch := llama.NewBatch(512, 0, 1)
 
 	// prompt eval
 	for i, t := range tokens {
@@ -115,7 +87,6 @@ func (s *Server) stream(w http.ResponseWriter, r *http.Request) {
 
 func main() {
 	mp := flag.String("model", "", "Path to model binary file")
-	parallel := flag.Int("parallel", 1, "Number of parallel requests to handle")
 	flag.Parse()
 
 	// load the model
@@ -131,8 +102,6 @@ func main() {
 	server := &Server{
 		model: model,
 		lc:    lc,
-		queue: make(chan Sequence, 256),
-		seqs:  make([]*Sequence, *parallel),
 	}
 
 	addr := "127.0.0.1:8080"

+ 18 - 6
llama/sync.sh

@@ -23,7 +23,7 @@ cp $src_dir/ggml-quants.c $dst_dir/ggml-quants.c
 cp $src_dir/ggml-quants.h $dst_dir/ggml-quants.h
 cp $src_dir/ggml-metal.metal $dst_dir/ggml-metal.metal
 cp $src_dir/ggml-metal.h $dst_dir/ggml-metal.h
-cp $src_dir/ggml-metal.m $dst_dir/ggml-metal-darwin_arm64.m
+cp $src_dir/ggml-metal.m $dst_dir/ggml-metal.m
 cp $src_dir/ggml-impl.h $dst_dir/ggml-impl.h
 cp $src_dir/ggml-cuda.h $dst_dir/ggml-cuda.h
 cp $src_dir/ggml-cuda.cu $dst_dir/ggml-cuda.cu
@@ -34,11 +34,23 @@ cp $src_dir/ggml-backend-impl.h $dst_dir/ggml-backend-impl.h
 cp $src_dir/ggml-alloc.h $dst_dir/ggml-alloc.h
 cp $src_dir/ggml-alloc.c $dst_dir/ggml-alloc.c
 
-sed -i 's/extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/\/\/ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/' ggml-cuda.cu
-sed -i '34iGGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);' ggml-cuda.h
-
-
 # ggml-cuda
 mkdir -p $dst_dir/ggml-cuda
 cp $src_dir/ggml-cuda/*.cu $dst_dir/ggml-cuda/
-cp $src_dir/ggml-cuda/*.cuh $dst_dir/ggml-cuda/
+cp $src_dir/ggml-cuda/*.cuh $dst_dir/ggml-cuda/
+
+sed -i 's/extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/\/\/ extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();/' ggml-cuda.cu
+sed -i '34iGGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);' ggml-cuda.h
+
+# ggml-metal
+sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > temp.metal
+TEMP_ASSEMBLY=$(mktemp)
+echo ".section __DATA, __ggml_metallib"   >  $TEMP_ASSEMBLY
+echo ".globl _ggml_metallib_start"        >> $TEMP_ASSEMBLY
+echo "_ggml_metallib_start:"              >> $TEMP_ASSEMBLY
+echo ".incbin \"temp.metal\"" >> $TEMP_ASSEMBLY
+echo ".globl _ggml_metallib_end"          >> $TEMP_ASSEMBLY
+echo "_ggml_metallib_end:"                >> $TEMP_ASSEMBLY
+as -mmacosx-version-min=11.3 $TEMP_ASSEMBLY -o ggml-metal.o
+rm -f $TEMP_ASSEMBLY
+rm -rf temp.metal
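This stanza folds the removed metal.sh into the sync step: it splices ggml-common.h into the Metal source, then assembles the result into ggml-metal.o with the blob bracketed by the _ggml_metallib_start and _ggml_metallib_end symbols that GGML_METAL_EMBED_LIBRARY builds consume. Purely to illustrate the pattern (ggml reads these symbols from C, not Go), the same pair could be sliced via cgo:

```go
package metal

// extern char _ggml_metallib_start;
// extern char _ggml_metallib_end;
import "C"

import "unsafe"

// metallibSource recovers the embedded Metal source by measuring the span
// between the start and end symbols emitted by the assembly above.
func metallibSource() string {
	start := unsafe.Pointer(&C._ggml_metallib_start)
	end := unsafe.Pointer(&C._ggml_metallib_end)
	n := int(uintptr(end) - uintptr(start))
	return string(unsafe.Slice((*byte)(start), n))
}
```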

+ 5 - 5
llm/filetype.go

@@ -2,10 +2,10 @@ package llm
 
 import "fmt"
 
-type fileType uint32
+type FileType uint32
 
 const (
-	fileTypeF32 fileType = iota
+	fileTypeF32 FileType = iota
 	fileTypeF16
 	fileTypeQ4_0
 	fileTypeQ4_1
@@ -41,7 +41,7 @@ const (
 	fileTypeUnknown
 )
 
-func ParseFileType(s string) (fileType, error) {
+func ParseFileType(s string) (FileType, error) {
 	switch s {
 	case "F32":
 		return fileTypeF32, nil
@@ -108,7 +108,7 @@ func ParseFileType(s string) (fileType, error) {
 	}
 }
 
-func (t fileType) String() string {
+func (t FileType) String() string {
 	switch t {
 	case fileTypeF32:
 		return "F32"
@@ -175,6 +175,6 @@ func (t fileType) String() string {
 	}
 }
 
-func (t fileType) Value() uint32 {
+func (t FileType) Value() uint32 {
 	return uint32(t)
 }
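Exporting the type while keeping the constants package-private means callers treat FileType values opaquely: parse a string at the boundary, pass the value through, and use String/Value at the edges. A round-trip sketch:

```go
package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/llm"
)

func main() {
	// F32 is the first constant in the iota block, so its Value() is 0.
	ft, err := llm.ParseFileType("F32")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(ft.String()) // "F32"
	fmt.Println(ft.Value())  // 0
}
```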

+ 1 - 1
llm/generate/gen_darwin.sh

@@ -100,4 +100,4 @@ esac
 
 cleanup
 wait_for_compress
-echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
+echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

+ 0 - 13
llm/generate/gen_linux.sh

@@ -58,19 +58,6 @@ init_vars
 git_module_setup
 apply_patches
 
-init_vars
-if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
-    # Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
-    # Enables optimized Dockerfile builds using a blanket skip and targeted overrides
-    # Static build for linking into the Go binary
-    init_vars
-    CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/linux/${ARCH}_static"
-    echo "Building static library"
-    build
-fi
-
 init_vars
 if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     # Users building from source can tune the exact flags we pass to cmake for configuring

+ 0 - 35
llm/generate/gen_windows.ps1

@@ -177,40 +177,6 @@ function cleanup {
 # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
 # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
 
-
-function build_static() {
-    if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
-        # GCC build for direct linking into the Go binary
-        init_vars
-        # cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
-        # as we need this to be compiled by gcc for golang to be able to link with it
-        write-host "Checking for MinGW..."
-        # error action ensures we exit on failure
-        get-command gcc
-        get-command mingw32-make
-        $oldTargets = $script:cmakeTargets
-        $script:cmakeTargets = @("llama", "ggml")
-        $script:cmakeDefs = @(
-            "-G", "MinGW Makefiles"
-            "-DCMAKE_C_COMPILER=gcc.exe",
-            "-DCMAKE_CXX_COMPILER=g++.exe",
-            "-DBUILD_SHARED_LIBS=off",
-            "-DGGML_NATIVE=off",
-            "-DGGML_AVX=off",
-            "-DGGML_AVX2=off",
-            "-DGGML_AVX512=off",
-            "-DGGML_F16C=off",
-            "-DGGML_FMA=off",
-            "-DGGML_OPENMP=off")
-        $script:buildDir="../build/windows/${script:ARCH}_static"
-        write-host "Building static library"
-        build
-        $script:cmakeTargets = $oldTargets
-    } else {
-        write-host "Skipping CPU generation step as requested"
-    }
-}
-
 function build_cpu($gen_arch) {
     if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
         # remaining llama.cpp builds use MSVC 
@@ -398,7 +364,6 @@ init_vars
 if ($($args.count) -eq 0) {
     git_module_setup
     apply_patches
-    build_static
     if ($script:ARCH -eq "arm64") {
         build_cpu("ARM64")
     } else { # amd64

+ 2 - 2
llm/ggml.go

@@ -55,9 +55,9 @@ func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }
 
-func (kv KV) FileType() fileType {
+func (kv KV) FileType() FileType {
 	if u64 := kv.u64("general.file_type"); u64 > 0 {
-		return fileType(uint32(u64))
+		return FileType(uint32(u64))
 	}
 
 	return fileTypeUnknown

+ 0 - 41
llm/llm.go

@@ -1,41 +0,0 @@
-package llm
-
-// #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
-// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
-// #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
-// #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
-// #cgo windows,amd64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src
-// #cgo windows,arm64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src
-// #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src
-// #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src
-// #include <stdlib.h>
-// #include "llama.h"
-import "C"
-
-import (
-	"errors"
-	"unsafe"
-)
-
-// SystemInfo is an unused example of calling llama.cpp functions using CGo
-func SystemInfo() string {
-	return C.GoString(C.llama_print_system_info())
-}
-
-func Quantize(infile, outfile string, ftype fileType) error {
-	cinfile := C.CString(infile)
-	defer C.free(unsafe.Pointer(cinfile))
-
-	coutfile := C.CString(outfile)
-	defer C.free(unsafe.Pointer(coutfile))
-
-	params := C.llama_model_quantize_default_params()
-	params.nthread = -1
-	params.ftype = ftype.Value()
-
-	if rc := C.llama_model_quantize(cinfile, coutfile, &params); rc != 0 {
-		return errors.New("failed to quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version")
-	}
-
-	return nil
-}

+ 2 - 1
server/images.go

@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
@@ -453,7 +454,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 						defer temp.Close()
 						defer os.Remove(temp.Name())
 
-						if err := llm.Quantize(blob, temp.Name(), want); err != nil {
+						if err := llama.Quantize(blob, temp.Name(), want); err != nil {
 							return err
 						}