fix cuda and rocm builds

jmorganca, 10 months ago
parent
commit
4b13e564eb
100 files changed with 2423 additions and 2607 deletions
  1. llama/.gitignore (+1, -0)
  2. llama/base64.hpp (+392, -392)
  3. llama/build-info.cpp (+1, -1)
  4. llama/build_cuda.sh (+12, -3)
  5. llama/build_hipblas.sh (+58, -34)
  6. llama/clip.cpp (+1, -1)
  7. llama/clip.h (+1, -1)
  8. llama/common.cpp (+23, -9)
  9. llama/common.h (+4, -1)
  10. llama/ggml-alloc.c (+1, -1)
  11. llama/ggml-alloc.h (+77, -77)
  12. llama/ggml-backend-impl.h (+142, -142)
  13. llama/ggml-backend.c (+1, -1)
  14. llama/ggml-backend.h (+1, -1)
  15. llama/ggml-common.h (+1, -1)
  16. llama/ggml-cuda.cu (+58, -32)
  17. llama/ggml-cuda.h (+1, -1)
  18. llama/ggml-cuda/acc.cu (+47, -47)
  19. llama/ggml-cuda/acc.cuh (+5, -5)
  20. llama/ggml-cuda/arange.cu (+34, -34)
  21. llama/ggml-cuda/arange.cuh (+5, -5)
  22. llama/ggml-cuda/argsort.cu (+103, -103)
  23. llama/ggml-cuda/argsort.cuh (+3, -3)
  24. llama/ggml-cuda/binbcast.cu (+280, -280)
  25. llama/ggml-cuda/binbcast.cuh (+6, -6)
  26. llama/ggml-cuda/clamp.cuh (+5, -5)
  27. llama/ggml-cuda/concat.cuh (+5, -5)
  28. llama/ggml-cuda/convert.cuh (+13, -13)
  29. llama/ggml-cuda/dequantize.cuh (+103, -103)
  30. llama/ggml-cuda/diagmask.cu (+40, -40)
  31. llama/ggml-cuda/diagmask.cuh (+5, -5)
  32. llama/ggml-cuda/dmmv.cuh (+18, -18)
  33. llama/ggml-cuda/fattn-vec-f16.cu (+0, -326)
  34. llama/ggml-cuda/fattn-vec-f32.cu (+0, -275)
  35. llama/ggml-cuda/getrows.cu (+178, -178)
  36. llama/ggml-cuda/getrows.cuh (+5, -5)
  37. llama/ggml-cuda/im2col.cu (+104, -104)
  38. llama/ggml-cuda/im2col.cuh (+5, -5)
  39. llama/ggml-cuda/mmq.cu (+2, -1)
  40. llama/ggml-cuda/mmq.cuh (+129, -107)
  41. llama/ggml-cuda/mmvq.cuh (+7, -7)
  42. llama/ggml-cuda/norm.cuh (+7, -7)
  43. llama/ggml-cuda/pad.cu (+49, -49)
  44. llama/ggml-cuda/pad.cuh (+5, -5)
  45. llama/ggml-cuda/pool2d.cu (+94, -94)
  46. llama/ggml-cuda/pool2d.cuh (+5, -5)
  47. llama/ggml-cuda/quantize.cu (+77, -10)
  48. llama/ggml-cuda/quantize.cuh (+16, -1)
  49. llama/ggml-cuda/rope.cuh (+5, -5)
  50. llama/ggml-cuda/scale.cuh (+5, -5)
  51. llama/ggml-cuda/softmax.cuh (+5, -5)
  52. llama/ggml-cuda/sumrows.cu (+40, -40)
  53. llama/ggml-cuda/sumrows.cuh (+3, -3)
  54. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu (+5, -0)
  55. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu (+5, -0)
  56. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu (+5, -0)
  57. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu (+5, -0)
  58. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu (+5, -0)
  59. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu (+5, -0)
  60. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu (+5, -0)
  61. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu (+5, -0)
  62. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu (+5, -0)
  63. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu (+5, -0)
  64. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu (+5, -0)
  65. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu (+5, -0)
  66. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu (+5, -0)
  67. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu (+5, -0)
  68. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu (+5, -0)
  69. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu (+5, -0)
  70. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu (+5, -0)
  71. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu (+5, -0)
  72. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu (+5, -0)
  73. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu (+5, -0)
  74. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu (+5, -0)
  75. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu (+5, -0)
  76. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu (+5, -0)
  77. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu (+5, -0)
  78. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu (+5, -0)
  79. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu (+5, -0)
  80. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu (+5, -0)
  81. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu (+5, -0)
  82. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu (+5, -0)
  83. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu (+5, -0)
  84. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu (+5, -0)
  85. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu (+5, -0)
  86. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu (+5, -0)
  87. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu (+5, -0)
  88. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu (+5, -0)
  89. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu (+5, -0)
  90. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu (+5, -0)
  91. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu (+5, -0)
  92. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu (+5, -0)
  93. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu (+5, -0)
  94. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu (+5, -0)
  95. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu (+5, -0)
  96. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu (+5, -0)
  97. llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu (+5, -0)
  98. llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu (+5, -0)
  99. llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu (+5, -0)
  100. llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu (+5, -0)

+ 1 - 0
llama/.gitignore

@@ -3,3 +3,4 @@
 *.lib
 *.exp
 *.dll
+*.o

+ 392 - 392
llama/base64.hpp

@@ -1,392 +1,392 @@
-/*
-This is free and unencumbered software released into the public domain.
-
-Anyone is free to copy, modify, publish, use, compile, sell, or
-distribute this software, either in source code form or as a compiled
-binary, for any purpose, commercial or non-commercial, and by any
-means.
-
-In jurisdictions that recognize copyright laws, the author or authors
-of this software dedicate any and all copyright interest in the
-software to the public domain. We make this dedication for the benefit
-of the public at large and to the detriment of our heirs and
-successors. We intend this dedication to be an overt act of
-relinquishment in perpetuity of all present and future rights to this
-software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to <http://unlicense.org>
-*/
-
-#ifndef PUBLIC_DOMAIN_BASE64_HPP_
-#define PUBLIC_DOMAIN_BASE64_HPP_
-
-#include <cstdint>
-#include <iterator>
-#include <stdexcept>
-#include <string>
-
-class base64_error : public std::runtime_error
-{
-public:
-    using std::runtime_error::runtime_error;
-};
-
-class base64
-{
-public:
-    enum class alphabet
-    {
-        /** the alphabet is detected automatically */
-        auto_,
-        /** the standard base64 alphabet is used */
-        standard,
-        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
-        url_filename_safe
-    };
-
-    enum class decoding_behavior
-    {
-        /** if the input is not padded, the remaining bits are ignored */
-        moderate,
-        /** if a padding character is encounter decoding is finished */
-        loose
-    };
-
-    /**
-     Encodes all the elements from `in_begin` to `in_end` to `out`.
-
-     @warning The source and destination cannot overlap. The destination must be able to hold at least
-     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
-
-     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
-     8 bits
-     @tparam Output_iterator the destination; the elements written to it are from the type `char`
-     @param in_begin the beginning of the source
-     @param in_end the ending of the source
-     @param out the destination iterator
-     @param alphabet which alphabet should be used
-     @returns the iterator to the next element past the last element copied
-     @throws see `Input_iterator` and `Output_iterator`
-    */
-    template<typename Input_iterator, typename Output_iterator>
-    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
-                                  alphabet alphabet = alphabet::standard)
-    {
-        constexpr auto pad = '=';
-        const char* alpha  = alphabet == alphabet::url_filename_safe
-                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
-                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-        while (in_begin != in_end) {
-            std::uint8_t i0 = 0, i1 = 0, i2 = 0;
-
-            // first character
-            i0 = static_cast<std::uint8_t>(*in_begin);
-            ++in_begin;
-
-            *out = alpha[i0 >> 2 & 0x3f];
-            ++out;
-
-            // part of first character and second
-            if (in_begin != in_end) {
-                i1 = static_cast<std::uint8_t>(*in_begin);
-                ++in_begin;
-
-                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
-                ++out;
-            } else {
-                *out = alpha[(i0 & 0x3) << 4];
-                ++out;
-
-                // last padding
-                *out = pad;
-                ++out;
-
-                // last padding
-                *out = pad;
-                ++out;
-
-                break;
-            }
-
-            // part of second character and third
-            if (in_begin != in_end) {
-                i2 = static_cast<std::uint8_t>(*in_begin);
-                ++in_begin;
-
-                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
-                ++out;
-            } else {
-                *out = alpha[(i1 & 0xf) << 2];
-                ++out;
-
-                // last padding
-                *out = pad;
-                ++out;
-
-                break;
-            }
-
-            // rest of third
-            *out = alpha[i2 & 0x3f];
-            ++out;
-        }
-
-        return out;
-    }
-    /**
-     Encodes a string.
-
-     @param str the string that should be encoded
-     @param alphabet which alphabet should be used
-     @returns the encoded base64 string
-     @throws see base64::encode()
-    */
-    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
-    {
-        std::string result;
-
-        result.reserve(required_encode_size(str.length()) + 1);
-
-        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
-
-        return result;
-    }
-    /**
-     Encodes a char array.
-
-     @param buffer the char array
-     @param size the size of the array
-     @param alphabet which alphabet should be used
-     @returns the encoded string
-    */
-    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
-    {
-        std::string result;
-
-        result.reserve(required_encode_size(size) + 1);
-
-        encode(buffer, buffer + size, std::back_inserter(result), alphabet);
-
-        return result;
-    }
-    /**
-     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
-     in other words: inplace decoding is possible.
-
-     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
-     otherwise the behavior depends on the output iterator.
-
-     @tparam Input_iterator the source; the returned elements are cast to `char`
-     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
-     @param in_begin the beginning of the source
-     @param in_end the ending of the source
-     @param out the destination iterator
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the iterator to the next element past the last element copied
-     @throws base64_error depending on the set behavior
-     @throws see `Input_iterator` and `Output_iterator`
-    */
-    template<typename Input_iterator, typename Output_iterator>
-    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
-                                  alphabet alphabet          = alphabet::auto_,
-                                  decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        //constexpr auto pad = '=';
-        std::uint8_t last  = 0;
-        auto bits          = 0;
-
-        while (in_begin != in_end) {
-            auto c = *in_begin;
-            ++in_begin;
-
-            if (c == '=') {
-                break;
-            }
-
-            auto part = _base64_value(alphabet, c);
-
-            // enough bits for one byte
-            if (bits + 6 >= 8) {
-                *out = (last << (8 - bits)) | (part >> (bits - 2));
-                ++out;
-
-                bits -= 2;
-            } else {
-                bits += 6;
-            }
-
-            last = part;
-        }
-
-        // check padding
-        if (behavior != decoding_behavior::loose) {
-            while (in_begin != in_end) {
-                auto c = *in_begin;
-                ++in_begin;
-
-                if (c != '=') {
-                    throw base64_error("invalid base64 character.");
-                }
-            }
-        }
-
-        return out;
-    }
-    /**
-     Decodes a string.
-
-     @param str the base64 encoded string
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the decoded string
-     @throws see base64::decode()
-    */
-    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
-                              decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        std::string result;
-
-        result.reserve(max_decode_size(str.length()));
-
-        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
-
-        return result;
-    }
-    /**
-     Decodes a string.
-
-     @param buffer the base64 encoded buffer
-     @param size the size of the buffer
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the decoded string
-     @throws see base64::decode()
-    */
-    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
-                              decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        std::string result;
-
-        result.reserve(max_decode_size(size));
-
-        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
-
-        return result;
-    }
-    /**
-     Decodes a string inplace.
-
-     @param[in,out] str the base64 encoded string
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @throws base64::decode_inplace()
-    */
-    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
-                               decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
-    }
-    /**
-     Decodes a char array inplace.
-
-     @param[in,out] str the string array
-     @param size the length of the array
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the pointer to the next element past the last element decoded
-     @throws base64::decode_inplace()
-    */
-    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
-                                decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        return decode(str, str + size, str, alphabet, behavior);
-    }
-    /**
-     Returns the required decoding size for a given size. The value is calculated with the following formula:
-
-     $$
-     \lceil \frac{size}{4} \rceil \cdot 3
-     $$
-
-     @param size the size of the encoded input
-     @returns the size of the resulting decoded buffer; this the absolute maximum
-    */
-    static std::size_t max_decode_size(std::size_t size) noexcept
-    {
-        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
-    }
-    /**
-     Returns the required encoding size for a given size. The value is calculated with the following formula:
-
-     $$
-     \lceil \frac{size}{3} \rceil \cdot 4
-     $$
-
-     @param size the size of the decoded input
-     @returns the size of the resulting encoded buffer
-    */
-    static std::size_t required_encode_size(std::size_t size) noexcept
-    {
-        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
-    }
-
-private:
-    static std::uint8_t _base64_value(alphabet& alphabet, char c)
-    {
-        if (c >= 'A' && c <= 'Z') {
-            return c - 'A';
-        } else if (c >= 'a' && c <= 'z') {
-            return c - 'a' + 26;
-        } else if (c >= '0' && c <= '9') {
-            return c - '0' + 52;
-        }
-
-        // comes down to alphabet
-        if (alphabet == alphabet::standard) {
-            if (c == '+') {
-                return 62;
-            } else if (c == '/') {
-                return 63;
-            }
-        } else if (alphabet == alphabet::url_filename_safe) {
-            if (c == '-') {
-                return 62;
-            } else if (c == '_') {
-                return 63;
-            }
-        } // auto detect
-        else {
-            if (c == '+') {
-                alphabet = alphabet::standard;
-
-                return 62;
-            } else if (c == '/') {
-                alphabet = alphabet::standard;
-
-                return 63;
-            } else if (c == '-') {
-                alphabet = alphabet::url_filename_safe;
-
-                return 62;
-            } else if (c == '_') {
-                alphabet = alphabet::url_filename_safe;
-
-                return 63;
-            }
-        }
-
-        throw base64_error("invalid base64 character.");
-    }
-};
-
-#endif // !PUBLIC_DOMAIN_BASE64_HPP_
+/*
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org>
+*/
+
+#ifndef PUBLIC_DOMAIN_BASE64_HPP_
+#define PUBLIC_DOMAIN_BASE64_HPP_
+
+#include <cstdint>
+#include <iterator>
+#include <stdexcept>
+#include <string>
+
+class base64_error : public std::runtime_error
+{
+public:
+    using std::runtime_error::runtime_error;
+};
+
+class base64
+{
+public:
+    enum class alphabet
+    {
+        /** the alphabet is detected automatically */
+        auto_,
+        /** the standard base64 alphabet is used */
+        standard,
+        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
+        url_filename_safe
+    };
+
+    enum class decoding_behavior
+    {
+        /** if the input is not padded, the remaining bits are ignored */
+        moderate,
+        /** if a padding character is encounter decoding is finished */
+        loose
+    };
+
+    /**
+     Encodes all the elements from `in_begin` to `in_end` to `out`.
+
+     @warning The source and destination cannot overlap. The destination must be able to hold at least
+     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
+
+     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
+     8 bits
+     @tparam Output_iterator the destination; the elements written to it are from the type `char`
+     @param in_begin the beginning of the source
+     @param in_end the ending of the source
+     @param out the destination iterator
+     @param alphabet which alphabet should be used
+     @returns the iterator to the next element past the last element copied
+     @throws see `Input_iterator` and `Output_iterator`
+    */
+    template<typename Input_iterator, typename Output_iterator>
+    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
+                                  alphabet alphabet = alphabet::standard)
+    {
+        constexpr auto pad = '=';
+        const char* alpha  = alphabet == alphabet::url_filename_safe
+                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+        while (in_begin != in_end) {
+            std::uint8_t i0 = 0, i1 = 0, i2 = 0;
+
+            // first character
+            i0 = static_cast<std::uint8_t>(*in_begin);
+            ++in_begin;
+
+            *out = alpha[i0 >> 2 & 0x3f];
+            ++out;
+
+            // part of first character and second
+            if (in_begin != in_end) {
+                i1 = static_cast<std::uint8_t>(*in_begin);
+                ++in_begin;
+
+                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
+                ++out;
+            } else {
+                *out = alpha[(i0 & 0x3) << 4];
+                ++out;
+
+                // last padding
+                *out = pad;
+                ++out;
+
+                // last padding
+                *out = pad;
+                ++out;
+
+                break;
+            }
+
+            // part of second character and third
+            if (in_begin != in_end) {
+                i2 = static_cast<std::uint8_t>(*in_begin);
+                ++in_begin;
+
+                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
+                ++out;
+            } else {
+                *out = alpha[(i1 & 0xf) << 2];
+                ++out;
+
+                // last padding
+                *out = pad;
+                ++out;
+
+                break;
+            }
+
+            // rest of third
+            *out = alpha[i2 & 0x3f];
+            ++out;
+        }
+
+        return out;
+    }
+    /**
+     Encodes a string.
+
+     @param str the string that should be encoded
+     @param alphabet which alphabet should be used
+     @returns the encoded base64 string
+     @throws see base64::encode()
+    */
+    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
+    {
+        std::string result;
+
+        result.reserve(required_encode_size(str.length()) + 1);
+
+        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
+
+        return result;
+    }
+    /**
+     Encodes a char array.
+
+     @param buffer the char array
+     @param size the size of the array
+     @param alphabet which alphabet should be used
+     @returns the encoded string
+    */
+    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
+    {
+        std::string result;
+
+        result.reserve(required_encode_size(size) + 1);
+
+        encode(buffer, buffer + size, std::back_inserter(result), alphabet);
+
+        return result;
+    }
+    /**
+     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
+     in other words: inplace decoding is possible.
+
+     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
+     otherwise the behavior depends on the output iterator.
+
+     @tparam Input_iterator the source; the returned elements are cast to `char`
+     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
+     @param in_begin the beginning of the source
+     @param in_end the ending of the source
+     @param out the destination iterator
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @returns the iterator to the next element past the last element copied
+     @throws base64_error depending on the set behavior
+     @throws see `Input_iterator` and `Output_iterator`
+    */
+    template<typename Input_iterator, typename Output_iterator>
+    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
+                                  alphabet alphabet          = alphabet::auto_,
+                                  decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        //constexpr auto pad = '=';
+        std::uint8_t last  = 0;
+        auto bits          = 0;
+
+        while (in_begin != in_end) {
+            auto c = *in_begin;
+            ++in_begin;
+
+            if (c == '=') {
+                break;
+            }
+
+            auto part = _base64_value(alphabet, c);
+
+            // enough bits for one byte
+            if (bits + 6 >= 8) {
+                *out = (last << (8 - bits)) | (part >> (bits - 2));
+                ++out;
+
+                bits -= 2;
+            } else {
+                bits += 6;
+            }
+
+            last = part;
+        }
+
+        // check padding
+        if (behavior != decoding_behavior::loose) {
+            while (in_begin != in_end) {
+                auto c = *in_begin;
+                ++in_begin;
+
+                if (c != '=') {
+                    throw base64_error("invalid base64 character.");
+                }
+            }
+        }
+
+        return out;
+    }
+    /**
+     Decodes a string.
+
+     @param str the base64 encoded string
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @returns the decoded string
+     @throws see base64::decode()
+    */
+    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
+                              decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        std::string result;
+
+        result.reserve(max_decode_size(str.length()));
+
+        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
+
+        return result;
+    }
+    /**
+     Decodes a string.
+
+     @param buffer the base64 encoded buffer
+     @param size the size of the buffer
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @returns the decoded string
+     @throws see base64::decode()
+    */
+    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
+                              decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        std::string result;
+
+        result.reserve(max_decode_size(size));
+
+        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
+
+        return result;
+    }
+    /**
+     Decodes a string inplace.
+
+     @param[in,out] str the base64 encoded string
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @throws base64::decode_inplace()
+    */
+    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
+                               decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
+    }
+    /**
+     Decodes a char array inplace.
+
+     @param[in,out] str the string array
+     @param size the length of the array
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @returns the pointer to the next element past the last element decoded
+     @throws base64::decode_inplace()
+    */
+    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
+                                decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        return decode(str, str + size, str, alphabet, behavior);
+    }
+    /**
+     Returns the required decoding size for a given size. The value is calculated with the following formula:
+
+     $$
+     \lceil \frac{size}{4} \rceil \cdot 3
+     $$
+
+     @param size the size of the encoded input
+     @returns the size of the resulting decoded buffer; this the absolute maximum
+    */
+    static std::size_t max_decode_size(std::size_t size) noexcept
+    {
+        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
+    }
+    /**
+     Returns the required encoding size for a given size. The value is calculated with the following formula:
+
+     $$
+     \lceil \frac{size}{3} \rceil \cdot 4
+     $$
+
+     @param size the size of the decoded input
+     @returns the size of the resulting encoded buffer
+    */
+    static std::size_t required_encode_size(std::size_t size) noexcept
+    {
+        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
+    }
+
+private:
+    static std::uint8_t _base64_value(alphabet& alphabet, char c)
+    {
+        if (c >= 'A' && c <= 'Z') {
+            return c - 'A';
+        } else if (c >= 'a' && c <= 'z') {
+            return c - 'a' + 26;
+        } else if (c >= '0' && c <= '9') {
+            return c - '0' + 52;
+        }
+
+        // comes down to alphabet
+        if (alphabet == alphabet::standard) {
+            if (c == '+') {
+                return 62;
+            } else if (c == '/') {
+                return 63;
+            }
+        } else if (alphabet == alphabet::url_filename_safe) {
+            if (c == '-') {
+                return 62;
+            } else if (c == '_') {
+                return 63;
+            }
+        } // auto detect
+        else {
+            if (c == '+') {
+                alphabet = alphabet::standard;
+
+                return 62;
+            } else if (c == '/') {
+                alphabet = alphabet::standard;
+
+                return 63;
+            } else if (c == '-') {
+                alphabet = alphabet::url_filename_safe;
+
+                return 62;
+            } else if (c == '_') {
+                alphabet = alphabet::url_filename_safe;
+
+                return 63;
+            }
+        }
+
+        throw base64_error("invalid base64 character.");
+    }
+};
+
+#endif // !PUBLIC_DOMAIN_BASE64_HPP_

+ 1 - 1
llama/build-info.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *

+ 12 - 3
llama/build_cuda.sh

@@ -9,7 +9,7 @@ else
 fi
 
 nvcc \
-    -t 12 \
+    -t $(nproc) \
     --generate-code=arch=compute_50,code=[compute_50,sm_50] \
     --generate-code=arch=compute_52,code=[compute_52,sm_52] \
     --generate-code=arch=compute_61,code=[compute_61,sm_61] \
@@ -30,9 +30,18 @@ nvcc \
     -use_fast_math \
     -link \
     -shared \
-    -fPIC \
     -I. \
     -lcuda -lcublas -lcudart -lcublasLt \
     -O3 \
     -o $output \
-    ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
+    ggml-cuda.cu \
+    ggml-cuda/*.cu \
+    ggml-cuda/template-instances/fattn-wmma*.cu \
+    ggml-cuda/template-instances/mmq*.cu \
+    ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu \
+    ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu \
+    ggml-cuda/template-instances/fattn-vec*f16-f16.cu \
+    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
+
+#   -DGGML_CUDA_USE_GRAPHS=1 
+#   -DGGML_CUDA_FA_ALL_QUANTS=1
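
A note on the CUDA change: the nvcc invocation now names the flash-attention template instances explicitly instead of compiling every generated combination, keeping only the f16-f16, q4_0-q4_0 and q8_0-q8_0 pairings alongside the wmma and mmq instances; the commented-out -DGGML_CUDA_FA_ALL_QUANTS=1 presumably covers the full set. As a quick sanity check, a hypothetical one-liner like the following (not part of the commit, assumed to be run from the llama/ directory) lists exactly which instance files those globs expand to:

    # preview the template-instance sources the updated nvcc command will compile
    ls ggml-cuda/template-instances/fattn-wmma*.cu \
       ggml-cuda/template-instances/mmq*.cu \
       ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu \
       ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu \
       ggml-cuda/template-instances/fattn-vec*f16-f16.cu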

+ 58 - 34
llama/build_hipblas.sh

@@ -26,7 +26,7 @@ additional_flags=""
 
 if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
     output="ggml-hipblas.dll"
-    additional_flags=" -Xclang --dependent-lib=msvcrt -Wl,/subsystem:console"
+    additional_flags=" -Xclang --dependent-lib=msvcrt"
 else
     output="libggml-hipblas.so"
     archs+=("${linux_archs[@]}")
@@ -36,37 +36,61 @@ for arch in "${archs[@]}"; do
     additional_flags+=" --offload-arch=$arch"
 done
 
-hipcc \
-    -v \
-    -parallel-jobs=12 \
-    -O3 \
-    -DGGML_USE_CUDA \
-    -DGGML_BUILD=1 \
-    -DGGML_SHARED=1 \
-    -DGGML_CUDA_DMMV_X=32 \
-    -DGGML_CUDA_MMV_Y=1 \
-    -DGGML_SCHED_MAX_COPIES=4 \
-    -DGGML_USE_HIPBLAS \
-    -DGGML_USE_LLAMAFILE \
-    -DHIP_FAST_MATH \
-    -DNDEBUG \
-    -DK_QUANTS_PER_ITERATION=2 \
-    -D_CRT_SECURE_NO_WARNINGS \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-    -D_GNU_SOURCE \
-    -Wno-expansion-to-defined \
-    -Wno-invalid-noreturn \
-    -Wno-ignored-attributes \
-    -Wno-pass-failed \
-    -Wno-deprecated-declarations \
-    -Wno-unused-result \
-    -I. \
-    -lhipblas -lamdhip64 -lrocblas \
-    -shared \
-    $additional_flags \
-    -o $output \
-    ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
+# Create an array of all source files, expanding globs
+sources=(
+    $(echo ggml-cuda/template-instances/fattn-wmma*.cu)
+    $(echo ggml-cuda/template-instances/mmq*.cu)
+    $(echo ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)
+    $(echo ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)
+    $(echo ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
+    ggml-cuda.cu
+    $(echo ggml-cuda/*.cu)
+    ggml.c
+    ggml-backend.c
+    ggml-alloc.c
+    ggml-quants.c
+    sgemm.cpp
+)
+
+# Function to compile a single source file
+compile_source() {
+    src="$1"
+    hipcc -c -O3 -DGGML_USE_CUDA -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 \
+          -DGGML_SCHED_MAX_COPIES=4 -DGGML_USE_HIPBLAS -DGGML_USE_LLAMAFILE -DHIP_FAST_MATH -DNDEBUG \
+          -DK_QUANTS_PER_ITERATION=2 -D_CRT_SECURE_NO_WARNINGS -DCMAKE_POSITION_INDEPENDENT_CODE=on \
+          -D_GNU_SOURCE -Wno-expansion-to-defined -Wno-invalid-noreturn -Wno-ignored-attributes -Wno-pass-failed \
+          -Wno-deprecated-declarations -Wno-unused-result -I. \
+          $additional_flags -o "${src%.cu}.o" "$src"
+}
+
+# Function to handle Ctrl+C
+cleanup() {
+    echo "Terminating all background processes..."
+    kill 0
+}
+
+# Set trap to handle SIGINT (Ctrl+C)
+trap cleanup SIGINT
+
+# Limit the number of concurrent jobs
+max_jobs=$(nproc)
+job_count=0
+
+for src in "${sources[@]}"; do
+    echo "$src"
+    compile_source "$src" &
+    job_count=$((job_count + 1))
+    if [[ $job_count -ge $max_jobs ]]; then
+        wait -n
+        job_count=$((job_count - 1))
+    fi
+done
+
+wait
+
+# Link all object files into a shared library
+echo "Linking object files..."
+hipcc -v -shared -o $output *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o -lhipblas -lamdhip64 -lrocblas
 
-    # -D_DLL \
-    # -D_MT \
-    # -D_XOPEN_SOURCE=600 \
+# Clean up object files after linking
+rm -f *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o
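
One design note on the ROCm script: each source file is now compiled to an object file in a background job, and the loop throttles concurrency with `wait -n`, which bash only provides from version 4.3 onward. A hedged pre-flight guard that could sit near the top of build_hipblas.sh (hypothetical, not part of this commit) would look like this:

    # abort early if this bash is too old for `wait -n` (added in bash 4.3)
    if (( BASH_VERSINFO[0] < 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] < 3) )); then
        echo "build_hipblas.sh requires bash >= 4.3 for 'wait -n'" >&2
        exit 1
    fi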

+ 1 - 1
llama/clip.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *

+ 1 - 1
llama/clip.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *

+ 23 - 9
llama/common.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *
@@ -226,19 +226,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
             }
             params.hf_file = params.model;
         } else if (params.model.empty()) {
-            std::string cache_directory = fs_get_cache_directory();
-            const bool success = fs_create_directory_with_parents(cache_directory);
-            if (!success) {
-                throw std::runtime_error("failed to create cache directory: " + cache_directory);
-            }
-            params.model = cache_directory + string_split(params.hf_file, '/').back();
+            params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
         }
     } else if (!params.model_url.empty()) {
         if (params.model.empty()) {
             auto f = string_split(params.model_url, '#').front();
             f = string_split(f, '?').front();
-            f = string_split(f, '/').back();
-            params.model =  "models/" + f;
+            params.model = fs_get_cache_file(string_split(f, '/').back());
         }
     } else if (params.model.empty()) {
         params.model = DEFAULT_MODEL_PATH;
@@ -1517,6 +1511,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chat_template = argv[i];
         return true;
     }
+    if (arg == "--slot-prompt-similarity" || arg == "-sps") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.slot_prompt_similarity = std::stof(argv[i]);
+        return true;
+    }
     if (arg == "-pps") {
         params.is_pp_shared = true;
         return true;
@@ -1939,6 +1941,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                                         "set custom jinja chat template (default: template taken from model's metadata)\n"
                                                                         "only commonly used templates are accepted:\n"
                                                                         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
+    options.push_back({ "server",      "-sps,  --slot-prompt-similarity SIMILARITY",
+                                                                        "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -2295,6 +2299,16 @@ std::string fs_get_cache_directory() {
     return ensure_trailing_slash(cache_directory);
 }
 
+std::string fs_get_cache_file(const std::string & filename) {
+    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
+    std::string cache_directory = fs_get_cache_directory();
+    const bool success = fs_create_directory_with_parents(cache_directory);
+    if (!success) {
+        throw std::runtime_error("failed to create cache directory: " + cache_directory);
+    }
+    return cache_directory + filename;
+}
+
 
 //
 // Model utils

+ 4 - 1
llama/common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *
@@ -229,6 +229,8 @@ struct gpt_params {
 
     std::string slot_save_path;
 
+    float slot_prompt_similarity = 0.5f;
+
     // batched-bench params
     bool is_pp_shared = false;
 
@@ -301,6 +303,7 @@ bool fs_validate_filename(const std::string & filename);
 bool fs_create_directory_with_parents(const std::string & path);
 
 std::string fs_get_cache_directory();
+std::string fs_get_cache_file(const std::string & filename);
 
 //
 // Model utils

+ 1 - 1
llama/ggml-alloc.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *

+ 77 - 77
llama/ggml-alloc.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *
@@ -24,79 +24,79 @@
  * SOFTWARE.
  */
 
-#pragma once
-
-#include "ggml.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
-
-// Tensor allocator
-struct ggml_tallocr {
-    ggml_backend_buffer_t buffer;
-    void * base;
-    size_t alignment;
-    size_t offset;
-};
-
-GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
-
-// Graph allocator
-/*
-  Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
-
-    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
-    ggml_gallocr_reserve(galloc, build_graph(max_batch));
-
-    // allocate the graph
-    struct ggml_cgraph * graph = build_graph(batch);
-    ggml_gallocr_alloc_graph(galloc, graph);
-
-    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
-
-    // evaluate the graph
-    ggml_backend_graph_compute(backend, graph);
-*/
-
-// special tensor flags for use with the graph allocator:
-//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
-//   ggml_set_output(): output tensors are never freed and never overwritten
-
-typedef struct ggml_gallocr * ggml_gallocr_t;
-
-GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
-GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
-GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
-
-// pre-allocate buffers from a measure graph - does not allocate or modify the graph
-// call with a worst-case graph to avoid buffer reallocations
-// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
-// returns false if the buffer allocation failed
-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(
-    ggml_gallocr_t galloc,
-    struct ggml_cgraph * graph,
-    const int * node_buffer_ids,
-    const int * leaf_buffer_ids);
-
-// automatic reallocation if the topology changes when using a single buffer
-// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
-GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-
-GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-
-// Utils
-// Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
-
-#ifdef  __cplusplus
-}
-#endif
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;
+
+// Tensor allocator
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
+
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
+
+// Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+
+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten
+
+typedef struct ggml_gallocr * ggml_gallocr_t;
+
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
+
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
+
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
+
+#ifdef  __cplusplus
+}
+#endif

+ 142 - 142
llama/ggml-backend-impl.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *
@@ -24,144 +24,144 @@
  * SOFTWARE.
  */
 
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
-    struct ggml_backend_buffer_type_i {
-        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
-        // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
-    };
-
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
-    };
-
-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
-
-    struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-    };
-
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
-        ggml_backend_buffer_context_t context;
-        size_t size;
-        enum ggml_backend_buffer_usage usage;
-    };
-
-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t      buft,
-            struct ggml_backend_buffer_i           iface,
-                   ggml_backend_buffer_context_t   context,
-                   size_t                          size);
-
-    // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // buffer that contains a collection of buffers
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-
-    //
-    // Backend
-    //
-
-    typedef void * ggml_backend_context_t;
-
-    struct ggml_backend_i {
-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
-
-        void (*GGML_CALL free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
-
-        // (optional) asynchronous tensor data access
-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // (optional) complete all pending operations
-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
-
-        // compute graph with a plan (not used currently)
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
-        // these should be expensive operations with large batch sizes that may benefit from running on this backend
-        // even if the weight has to be copied from the CPU temporarily
-        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // (optional) event synchronization
-        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
-        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
-        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
-        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
-        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
-    };
-
-    struct ggml_backend {
-        ggml_guid_t guid;
-
-        struct ggml_backend_i iface;
-        ggml_backend_context_t context;
-    };
-
-    struct ggml_backend_event {
-        ggml_backend_t backend;
-        void * context;
-    };
-
-    //
-    // Backend registry
-    //
-
-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
-
-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
-#ifdef  __cplusplus
-}
-#endif
+#pragma once
+
+// ggml-backend internal header
+
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    //
+    // Backend buffer
+    //
+
+    // buffer type
+    typedef void * ggml_backend_buffer_type_context_t;
+
+    struct ggml_backend_buffer_type_i {
+        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
+    };
+
+    struct ggml_backend_buffer_type {
+        struct ggml_backend_buffer_type_i  iface;
+        ggml_backend_buffer_type_context_t context;
+    };
+
+    // buffer
+    typedef void * ggml_backend_buffer_context_t;
+
+    struct ggml_backend_buffer_i {
+        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+    };
+
+    struct ggml_backend_buffer {
+        struct ggml_backend_buffer_i  iface;
+        ggml_backend_buffer_type_t    buft;
+        ggml_backend_buffer_context_t context;
+        size_t size;
+        enum ggml_backend_buffer_usage usage;
+    };
+
+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
+                   ggml_backend_buffer_type_t      buft,
+            struct ggml_backend_buffer_i           iface,
+                   ggml_backend_buffer_context_t   context,
+                   size_t                          size);
+
+    // do not use directly, use ggml_backend_tensor_copy instead
+    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
+    //
+    // Backend
+    //
+
+    typedef void * ggml_backend_context_t;
+
+    struct ggml_backend_i {
+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
+
+        void (*GGML_CALL free)(ggml_backend_t backend);
+
+        // buffer allocation
+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
+
+        // (optional) asynchronous tensor data access
+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        // (optional) complete all pending operations
+        void (*GGML_CALL synchronize)(ggml_backend_t backend);
+
+        // compute graph with a plan (not used currently)
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+        // compute graph with a plan
+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // compute graph without a plan (async)
+        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+        // check if the backend supports an operation
+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
+        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
+    };
+
+    struct ggml_backend {
+        ggml_guid_t guid;
+
+        struct ggml_backend_i iface;
+        ggml_backend_context_t context;
+    };
+
+    struct ggml_backend_event {
+        ggml_backend_t backend;
+        void * context;
+    };
+
+    //
+    // Backend registry
+    //
+
+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
+
+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+
+#ifdef  __cplusplus
+}
+#endif
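Note: the interface above is a plain table of function pointers; a backend fills in the entries it supports and leaves the optional hooks (async tensor access, events, graph plans) as NULL. Below is a minimal sketch, assuming only this header, of how a hypothetical out-of-tree backend might populate it; the my_* names are invented for illustration and are not part of ggml. A struct ggml_backend then ties the iface to a guid and a context, and ggml_backend_register() exposes an init function for it through the registry.

    #include "ggml-backend-impl.h"

    GGML_CALL static const char * my_get_name(ggml_backend_t backend) {
        GGML_UNUSED(backend);
        return "my-backend";
    }

    GGML_CALL static void my_free(ggml_backend_t backend) {
        GGML_UNUSED(backend); // a real backend would release its context here
    }

    GGML_CALL static enum ggml_status my_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
        GGML_UNUSED(backend);
        GGML_UNUSED(cgraph);  // a real backend would execute every node of the graph here
        return GGML_STATUS_SUCCESS;
    }

    GGML_CALL static bool my_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
        GGML_UNUSED(backend);
        GGML_UNUSED(op);      // report only the ops the device can actually run
        return false;
    }

    static struct ggml_backend_i my_iface = {
        /* .get_name                = */ my_get_name,
        /* .free                    = */ my_free,
        /* .get_default_buffer_type = */ NULL, // would return the backend's buffer type
        /* .set_tensor_async        = */ NULL,
        /* .get_tensor_async        = */ NULL,
        /* .cpy_tensor_async        = */ NULL,
        /* .synchronize             = */ NULL,
        /* .graph_plan_create       = */ NULL,
        /* .graph_plan_free         = */ NULL,
        /* .graph_plan_compute      = */ NULL,
        /* .graph_compute           = */ my_graph_compute,
        /* .supports_op             = */ my_supports_op,
        /* .offload_op              = */ NULL,
        /* .event_new               = */ NULL,
        /* .event_free              = */ NULL,
        /* .event_record            = */ NULL,
        /* .event_wait              = */ NULL,
        /* .event_synchronize       = */ NULL,
    };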

+ 1 - 1
llama/ggml-backend.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-backend.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *

+ 58 - 32
llama/ggml-cuda.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *
@@ -1377,10 +1377,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
     GGML_UNUSED(main_device);
 }
 
+static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
+    void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
+
+#if !defined(GGML_USE_HIPBLAS)
+    // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+    cudaMemcpy3DPeerParms p = {};
+    p.dstDevice = dstDevice;
+    p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
+    p.srcDevice = srcDevice;
+    p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
+    p.extent = make_cudaExtent(width, height, 1);
+    return cudaMemcpy3DPeerAsync(&p, stream);
+#else
+    // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
+    GGML_UNUSED(dstDevice);
+    GGML_UNUSED(srcDevice);
+    return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
+#endif // !defined(GGML_USE_HIPBLAS)
+}
+
 static void ggml_cuda_op_mul_mat(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
-    const bool convert_src1_to_q8_1) {
+    quantize_cuda_t quantize_src1) {
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
@@ -1437,7 +1457,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     struct dev_data {
-        ggml_cuda_pool_alloc<char>  src0_dd_alloc;
+        int cc;
+
+        ggml_cuda_pool_alloc<char>   src0_dd_alloc;
         ggml_cuda_pool_alloc<float> src1_ddf_alloc;
         ggml_cuda_pool_alloc<char>  src1_ddq_alloc;
         ggml_cuda_pool_alloc<float>   dst_dd_alloc;
@@ -1456,6 +1478,8 @@ static void ggml_cuda_op_mul_mat(
     int used_devices = 0;
 
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        dev[id].cc = ggml_cuda_info().devices[id].cc;
+
         // by default, use all rows
         dev[id].row_low  = 0;
         dev[id].row_high = ne01;
@@ -1506,11 +1530,15 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
         }
 
-        if (convert_src1_to_q8_1) {
-            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
+        if (quantize_src1) {
+            size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
+            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
+            }
+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
 
             if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
+                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
@@ -1556,7 +1584,12 @@ static void ggml_cuda_op_mul_mat(
                 const int64_t i03 = i0 / ne12;
                 const int64_t i02 = i0 % ne12;
 
-                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
+                size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+                if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                    src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
+                } else {
+                    src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+                }
 
                 // for split tensors the data begins at i0 == i0_offset_low
                 char  *  src0_dd_i =  dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
@@ -1573,10 +1606,17 @@ static void ggml_cuda_op_mul_mat(
                 // copy src0, src1 to device if necessary
                 if (src1_is_contiguous) {
                     if (id != ctx.device) {
-                        if (convert_src1_to_q8_1) {
+                        if (quantize_src1) {
                             char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
-                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
-                                                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+                            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                                const size_t pitch = ne11*sizeof(block_q8_1_mmq);
+                                const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
+                                const size_t height = src1_padded_col_size/(4*QK8_1);
+                                CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
+                            } else {
+                                CUDA_CHECK(cudaMemcpyPeerAsync(
+                                    src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+                            }
                         } else {
                             float * src1_ddf_i_source = (float *) src1->data;
                             src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
@@ -1591,8 +1631,8 @@ static void ggml_cuda_op_mul_mat(
                     GGML_ASSERT(false);
                 }
 
-                if (convert_src1_to_q8_1 && !src1_is_contiguous) {
-                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+                if (quantize_src1 && !src1_is_contiguous) {
+                    quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
                     CUDA_CHECK(cudaGetLastError());
                 }
 
@@ -1617,22 +1657,8 @@ static void ggml_cuda_op_mul_mat(
                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                         dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
-#if !defined(GGML_USE_HIPBLAS)
-                        // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
-                        cudaMemcpy3DPeerParms p = {};
-                        p.dstDevice = ctx.device;
-                        p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
-                        p.srcDevice = id;
-                        p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
-                        p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
-                        CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
-#else
-                        // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
-                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
-                                                        dst_dd_i, row_diff*sizeof(float),
-                                                        row_diff*sizeof(float), src1_ncols,
-                                                        cudaMemcpyDeviceToDevice, stream));
-#endif
+                        CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
+                            dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
                     } else {
                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -1971,13 +1997,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
     } else {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
     }
 }
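Two related changes in this file: cross-device copies of partial results now go through the new ggml_cuda_Memcpy2DPeerAsync() helper (cudaMemcpy3DPeerAsync on CUDA, since cudaMemcpy2DAsync can fail between VMM pools of different devices; plain cudaMemcpy2DAsync on HIP), and the former convert_src1_to_q8_1 flag becomes a quantize_cuda_t function pointer, so each matrix-multiplication path selects its own src1 quantization routine. A condensed sketch of that selection, assuming quantize_cuda_t matches the call sites above:

    // sketch only - mirrors the dispatch in ggml_cuda_mul_mat() above
    quantize_cuda_t quantize_src1 = nullptr;        // cuBLAS / dequantize-mul-mat-vec paths: no quantization
    if (use_mul_mat_vec_q) {
        quantize_src1 = quantize_row_q8_1_cuda;     // flat q8_1 blocks for mul_mat_vec_q
    } else if (use_mul_mat_q) {
        quantize_src1 = quantize_mmq_q8_1_cuda;     // block_q8_1_mmq layout for MMQ; the buffer gets an extra
                                                    // get_mmq_x_max_host(cc) * sizeof(block_q8_1_mmq) of padding
    }
    // then: ggml_cuda_op_mul_mat(ctx, src0, src1, dst, <matching op>, quantize_src1);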
 

+ 1 - 1
llama/ggml-cuda.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
  *
  * MIT License
  *

+ 47 - 47
llama/ggml-cuda/acc.cu

@@ -1,47 +1,47 @@
-#include "acc.cuh"
-
-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= ne) {
-        return;
-    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
-}
-
-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
-
-    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
-}
+#include "acc.cuh"
+
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+}
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // op_params[3] is the offset in bytes; /4 converts it to float elements
+
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+}

+ 5 - 5
llama/ggml-cuda/acc.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_ACC_BLOCK_SIZE 256
-
-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_ACC_BLOCK_SIZE 256
+
+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 34 - 34
llama/ggml-cuda/arange.cu

@@ -1,34 +1,34 @@
-#include "arange.cuh"
-
-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    dst[nidx] = start + step * nidx;
-}
-
-static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
-    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
-}
-
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    float start;
-    float stop;
-    float step;
-    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
-    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
-    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
-}
+#include "arange.cuh"
+
+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
+    // blockIdx.x: index over ne0 / BLOCK_SIZE blocks
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+    dst[nidx] = start + step * nidx;
+}
+
+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
+}
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    float start;
+    float stop;
+    float step;
+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
+
+    int64_t steps = (int64_t)ceil((stop - start) / step);
+    GGML_ASSERT(ggml_nelements(dst) == steps);
+
+    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
+}
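For reference, a worked example of the steps check above (values assumed for illustration):

    // start = 0.0f, stop = 5.0f, step = 1.5f
    // steps = ceil((5.0f - 0.0f) / 1.5f) = ceil(3.33) = 4, so ggml_nelements(dst) must equal 4
    // the kernel then writes dst = { 0.0f, 1.5f, 3.0f, 4.5f }  (start + step * nidx for nidx = 0..3)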

+ 5 - 5
llama/ggml-cuda/arange.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_ARANGE_BLOCK_SIZE 256
-
-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_ARANGE_BLOCK_SIZE 256
+
+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 103 - 103
llama/ggml-cuda/argsort.cu

@@ -1,103 +1,103 @@
-#include "argsort.cuh"
-
-template<typename T>
-static inline __device__ void ggml_cuda_swap(T & a, T & b) {
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-template<ggml_sort_order order>
-static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
-    // bitonic sort
-    int col = threadIdx.x;
-    int row = blockIdx.y;
-
-    if (col >= ncols_pad) {
-        return;
-    }
-
-    const float * x_row = x + row * ncols;
-    extern __shared__ int dst_row[];
-
-    // initialize indices
-    dst_row[col] = col;
-
-    __syncthreads();
-
-    for (int k = 2; k <= ncols_pad; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (dst_row[col] >= ncols ||
-                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (dst_row[ixj] >= ncols ||
-                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            __syncthreads();
-        }
-    }
-
-    // copy the result to dst without the padding
-    if (col < ncols) {
-        dst[row * ncols + col] = dst_row[col];
-    }
-}
-
-static int next_power_of_2(int x) {
-    int n = 1;
-    while (n < x) {
-        n *= 2;
-    }
-    return n;
-}
-
-static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
-    // bitonic sort requires ncols to be power of 2
-    const int ncols_pad = next_power_of_2(ncols);
-
-    const dim3 block_dims(ncols_pad, 1, 1);
-    const dim3 block_nums(1, nrows, 1);
-    const size_t shared_mem = ncols_pad * sizeof(int);
-
-    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
-
-    if (order == GGML_SORT_ORDER_ASC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-    } else if (order == GGML_SORT_ORDER_DESC) {
-        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
-}
+#include "argsort.cuh"
+
+template<typename T>
+static inline __device__ void ggml_cuda_swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
+    // bitonic sort
+    int col = threadIdx.x;
+    int row = blockIdx.y;
+
+    if (col >= ncols_pad) {
+        return;
+    }
+
+    const float * x_row = x + row * ncols;
+    extern __shared__ int dst_row[];
+
+    // initialize indices
+    dst_row[col] = col;
+
+    __syncthreads();
+
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+static int next_power_of_2(int x) {
+    int n = 1;
+    while (n < x) {
+        n *= 2;
+    }
+    return n;
+}
+
+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
+    // bitonic sort requires ncols to be a power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    const dim3 block_dims(ncols_pad, 1, 1);
+    const dim3 block_nums(1, nrows, 1);
+    const size_t shared_mem = ncols_pad * sizeof(int);
+
+    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
+
+    if (order == GGML_SORT_ORDER_ASC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
+}
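A short worked example of the padding above, with an assumed row width:

    // ncols = 6  ->  ncols_pad = next_power_of_2(6) = 8
    // block_dims = (8, 1, 1), one block per row, shared_mem = 8 * sizeof(int) = 32 bytes
    // the two padding slots hold indices >= ncols; the comparisons order them after every valid index
    // (for both ASC and DESC), so they sink to the end and the final `if (col < ncols)` copy drops them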

+ 3 - 3
llama/ggml-cuda/argsort.cuh

@@ -1,3 +1,3 @@
-#include "common.cuh"
-
-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 280 - 280
llama/ggml-cuda/binbcast.cu

@@ -1,280 +1,280 @@
-#include "binbcast.cuh"
-
-static __device__ __forceinline__ float op_repeat(const float a, const float b) {
-    return b;
-    GGML_UNUSED(a);
-}
-
-static __device__ __forceinline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __device__ __forceinline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __device__ __forceinline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13) {
-    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
-    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
-    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13) {
-
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_cuda {
-    template<typename src0_t, typename src1_t, typename dst_t>
-    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
-            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
-            cudaStream_t stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne[] = {ne0, ne1, ne2, ne3};
-        int64_t cne0[] = {ne00, ne01, ne02, ne03};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-
-        size_t cnb[] = {nb0, nb1, nb2, nb3};
-        size_t cnb0[] = {nb00, nb01, nb02, nb03};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
-            for (int i = 0; i < 4; i++) {
-                if (nr[i] != 1) {
-                    break;
-                }
-                if (i > 0) {
-                    collapse_nb(cnb, cne);
-                    collapse_nb(cnb0, cne0);
-                    collapse_nb(cnb1, cne1);
-                    collapse(cne);
-                    collapse(cne0);
-                    collapse(cne1);
-                }
-            }
-        }
-
-        {
-            int64_t ne0 = cne[0];
-            int64_t ne1 = cne[1];
-            int64_t ne2 = cne[2];
-            int64_t ne3 = cne[3];
-
-            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
-            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
-            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
-            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb[0];
-            size_t nb1 = cnb[1];
-            size_t nb2 = cnb[2];
-            size_t nb3 = cnb[3];
-
-            size_t nb00 = cnb0[0];
-            size_t nb01 = cnb0[1];
-            size_t nb02 = cnb0[2];
-            size_t nb03 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            size_t s00 = nb00 / sizeof(src0_t);
-            size_t s01 = nb01 / sizeof(src0_t);
-            size_t s02 = nb02 / sizeof(src0_t);
-            size_t s03 = nb03 / sizeof(src0_t);
-
-            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
-
-            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
-
-            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s00 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            dim3 block_dims;
-            block_dims.x = std::min<unsigned int>(hne0, block_size);
-            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
-            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
-
-            dim3 block_nums(
-                (hne0 + block_dims.x - 1) / block_dims.x,
-                (ne1 + block_dims.y - 1) / block_dims.y,
-                (ne2*ne3 + block_dims.z - 1) / block_dims.z
-            );
-
-            if (block_nums.z > 65535) {
-                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s00, */ s01, s02, s03,
-                    /* s10, */ s11, s12, s13);
-            } else {
-                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
-                    src0_dd, src1_dd, dst_dd,
-                    ne0, ne1, ne2, ne3,
-                    ne10, ne11, ne12, ne13,
-                    /* s0, */ s1, s2, s3,
-                    /* s00, */ s01, s02, s03,
-                    /* s10, */ s11, s12, s13);
-            }
-        }
-    }
-};
-
-template<class op>
-static void ggml_cuda_op_bin_bcast(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-}
-
-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
-
-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
-}
+#include "binbcast.cuh"
+
+static __device__ __forceinline__ float op_repeat(const float a, const float b) {
+    return b;
+    GGML_UNUSED(a);
+}
+
+static __device__ __forceinline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __device__ __forceinline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __device__ __forceinline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
+        /*int s10,*/ int s11, int s12, int s13) {
+    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
+    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
+    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
+        /*int s10,*/ int s11, int s12, int s13) {
+
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}
+
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template<typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
+            cudaStream_t stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne[] = {ne0, ne1, ne2, ne3};
+        int64_t cne0[] = {ne00, ne01, ne02, ne03};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+
+        size_t cnb[] = {nb0, nb1, nb2, nb3};
+        size_t cnb0[] = {nb00, nb01, nb02, nb03};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+            for (int i = 0; i < 4; i++) {
+                if (nr[i] != 1) {
+                    break;
+                }
+                if (i > 0) {
+                    collapse_nb(cnb, cne);
+                    collapse_nb(cnb0, cne0);
+                    collapse_nb(cnb1, cne1);
+                    collapse(cne);
+                    collapse(cne0);
+                    collapse(cne1);
+                }
+            }
+        }
+
+        {
+            int64_t ne0 = cne[0];
+            int64_t ne1 = cne[1];
+            int64_t ne2 = cne[2];
+            int64_t ne3 = cne[3];
+
+            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
+            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
+            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
+            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb[0];
+            size_t nb1 = cnb[1];
+            size_t nb2 = cnb[2];
+            size_t nb3 = cnb[3];
+
+            size_t nb00 = cnb0[0];
+            size_t nb01 = cnb0[1];
+            size_t nb02 = cnb0[2];
+            size_t nb03 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            size_t s00 = nb00 / sizeof(src0_t);
+            size_t s01 = nb01 / sizeof(src0_t);
+            size_t s02 = nb02 / sizeof(src0_t);
+            size_t s03 = nb03 / sizeof(src0_t);
+
+            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+
+            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
+
+            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s00 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            dim3 block_dims;
+            block_dims.x = std::min<unsigned int>(hne0, block_size);
+            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
+            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
+
+            dim3 block_nums(
+                (hne0 + block_dims.x - 1) / block_dims.x,
+                (ne1 + block_dims.y - 1) / block_dims.y,
+                (ne2*ne3 + block_dims.z - 1) / block_dims.z
+            );
+
+            if (block_nums.z > 65535) {
+                // this is the maximum number of blocks in the z dimension; fall back to the 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00, */ s01, s02, s03,
+                    /* s10, */ s11, s12, s13);
+            } else {
+                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd,
+                    ne0, ne1, ne2, ne3,
+                    ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00, */ s01, s02, s03,
+                    /* s10, */ s11, s12, s13);
+            }
+        }
+    }
+};
+
+template<class op>
+static void ggml_cuda_op_bin_bcast(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+}
+
+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
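A worked example of the dimension collapsing above, with shapes assumed for illustration:

    // dst/src0 ne = {4096, 32, 8, 1}, src1 ne = {4096, 32, 1, 1}  ->  nr = {1, 1, 0, 1}
    // the loop merges dims 0 and 1 (nr[0] == nr[1] == 1) and stops at the first broadcast dim (nr[2] != 1):
    //     cne becomes {4096*32, 8, 1, 1} and cne1 becomes {4096*32, 1, 1, 1}
    // inside the kernel, i12 = i2 % ne12 = 0, so src1's single i2 plane is reused for all 8 values of i2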

+ 6 - 6
llama/ggml-cuda/binbcast.cuh

@@ -1,6 +1,6 @@
-#include "common.cuh"
-
-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 5 - 5
llama/ggml-cuda/clamp.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_CLAMP_BLOCK_SIZE 256
-
-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_CLAMP_BLOCK_SIZE 256
+
+void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 5 - 5
llama/ggml-cuda/concat.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_CONCAT_BLOCK_SIZE 256
-
-void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_CONCAT_BLOCK_SIZE 256
+
+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 13 - 13
llama/ggml-cuda/convert.cuh

@@ -1,13 +1,13 @@
-#include "common.cuh"
-
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-
-template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
-
-typedef to_t_cuda_t<float> to_fp32_cuda_t;
-typedef to_t_cuda_t<half> to_fp16_cuda_t;
-
-to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
-
-to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
+#include "common.cuh"
+
+#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+
+template<typename T>
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
+
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<half> to_fp16_cuda_t;
+
+to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
+
+to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);

+ 103 - 103
llama/ggml-cuda/dequantize.cuh

@@ -1,103 +1,103 @@
-#include "common.cuh"
-
-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {8.0f, 8.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x = (v.x - 8.0f) * d;
-    v.y = (v.y - 8.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {16.0f, 16.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x = (v.x - 16.0f) * d;
-    v.y = (v.y - 16.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    v.x = x[ib].qs[iqs + 0];
-    v.y = x[ib].qs[iqs + 1];
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-#else
-    v.x *= d;
-    v.y *= d;
-#endif // GGML_CUDA_F16
-}
+#include "common.cuh"
+
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {8.0f, 8.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 8.0f) * d;
+    v.y = (v.y - 8.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x = vui & 0xF;
+    v.y = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {16.0f, 16.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x = (v.x - 16.0f) * d;
+    v.y = (v.y - 16.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const dfloat d = __low2half(x[ib].dm);
+    const dfloat m = __high2half(x[ib].dm);
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x = (v.x * d) + m;
+    v.y = (v.y * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    v.x = x[ib].qs[iqs + 0];
+    v.y = x[ib].qs[iqs + 1];
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+#else
+    v.x *= d;
+    v.y *= d;
+#endif // GGML_CUDA_F16
+}
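
All of the helpers above share one pattern: load a per-block scale d (plus a per-block minimum m for the *_1 formats), then expand two quantized values at a time into a dfloat2, e.g. (q - 8) * d for q4_0 and (q - 16) * d for q5_0 once the fifth bit has been merged in from qh. A minimal host-side sketch of just the q4_0 arithmetic (plain C++; the 32-weight block size and the low/high-nibble split are the usual ggml layout, restated here as assumptions, and the fp16 scale is passed in already converted to float):

// Host-side sketch of the q4_0 arithmetic mirrored by dequantize_q4_0 above.
#include <cstdint>

constexpr int QK4_0_SKETCH = 32;   // assumed block size

// qs: 16 packed bytes (two 4-bit quants each), d: per-block scale,
// y: 32 output floats. Low nibbles fill the first half of the block,
// high nibbles the second half (the same split k_get_rows applies with
// y_offset = qk/2).
void dequantize_block_q4_0_ref(const uint8_t * qs, float d, float * y) {
    for (int i = 0; i < QK4_0_SKETCH/2; ++i) {
        const int lo = qs[i] & 0xF;
        const int hi = qs[i] >> 4;
        y[i]                  = (lo - 8) * d;   // same (q - 8) * d as dequantize_q4_0
        y[i + QK4_0_SKETCH/2] = (hi - 8) * d;
    }
}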

+ 40 - 40
llama/ggml-cuda/diagmask.cu

@@ -1,40 +1,40 @@
-#include "diagmask.cuh"
-
-static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
-    const int col = blockDim.y*blockIdx.y + threadIdx.y;
-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int i = row*ncols + col;
-    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
-    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
-}
-
-static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
-    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
-    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const dim3 block_nums(nrows_x, block_num_x, 1);
-    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
-}
-
-void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int nrows0 = ggml_nrows(src0);
-
-    const int n_past = ((int32_t *) dst->op_params)[0];
-
-    diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);
-}
+#include "diagmask.cuh"
+
+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
+    const int col = blockDim.y*blockIdx.y + threadIdx.y;
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int i = row*ncols + col;
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
+}
+
+static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const dim3 block_nums(nrows_x, block_num_x, 1);
+    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
+}
+
+void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int nrows0 = ggml_nrows(src0);
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+
+    diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);
+}
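
The kernel above applies the causal mask branchlessly: rather than selecting -INFINITY with a ternary, it subtracts FLT_MAX whenever col exceeds n_past + row % rows_per_channel, pushing the value low enough that the following softmax sends it to zero. A self-contained CPU check of the same expression, with small made-up dimensions:

// CPU sketch of the branchless causal mask used by diag_mask_inf_f32 above.
#include <cfloat>
#include <cstdio>

int main() {
    const int ncols = 4, nrows = 4, rows_per_channel = 4, n_past = 0;
    float x[nrows * ncols];
    for (int i = 0; i < nrows * ncols; ++i) x[i] = 1.0f;

    for (int row = 0; row < nrows; ++row) {
        for (int col = 0; col < ncols; ++col) {
            const int i = row * ncols + col;
            // (col > n_past + row % rows_per_channel) is 0 or 1, so this either
            // leaves x[i] untouched or drives it to a huge negative value.
            const float masked = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
            std::printf("%c", masked < -1e30f ? '.' : '#');
        }
        std::printf("\n");   // prints a lower-triangular pattern of '#'
    }
    return 0;
}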

+ 5 - 5
llama/ggml-cuda/diagmask.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
-
-void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+
+void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 18 - 18
llama/ggml-cuda/dmmv.cuh

@@ -1,18 +1,18 @@
-#include "common.cuh"
-
-// dmmv = dequantize_mul_mat_vec
-
-// TODO: remove this?
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
-#endif
-
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
-#endif
-
-void ggml_cuda_op_dequantize_mul_mat_vec(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
+#include "common.cuh"
+
+// dmmv = dequantize_mul_mat_vec
+
+// TODO: remove this?
+#ifndef GGML_CUDA_DMMV_X
+#define GGML_CUDA_DMMV_X 32
+#endif
+
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
+#endif
+
+void ggml_cuda_op_dequantize_mul_mat_vec(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);

+ 0 - 326
llama/ggml-cuda/fattn-vec-f16.cu

@@ -1,326 +0,0 @@
-#include "common.cuh"
-#include "fattn-common.cuh"
-#include "fattn-vec-f16.cuh"
-
-template<int D, int ncols, int parallel_blocks> // D == head size
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-__launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-static __global__ void flash_attn_vec_ext_f16(
-        const char * __restrict__ Q,
-        const char * __restrict__ K,
-        const char * __restrict__ V,
-        const char * __restrict__ mask,
-        float      * __restrict__ dst,
-        float2     * __restrict__ dst_meta,
-        const float scale,
-        const float max_bias,
-        const float m0,
-        const float m1,
-        const uint32_t n_head_log2,
-        const int ne00,
-        const int ne01,
-        const int ne02,
-        const int ne03,
-        const int ne10,
-        const int ne11,
-        const int ne12,
-        const int ne13,
-        const int ne31,
-        const int nb31,
-        const int nb01,
-        const int nb02,
-        const int nb03,
-        const int nb11,
-        const int nb12,
-        const int nb13,
-        const int ne0,
-        const int ne1,
-        const int ne2,
-        const int ne3) {
-#if FP16_AVAILABLE
-    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
-
-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
-
-    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
-    const half   * V_h   = (const half   *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *)  mask + ne11*ic0;
-
-    const int stride_KV  = nb11 / sizeof(half);
-    const int stride_KV2 = nb11 / sizeof(half2);
-
-    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
-    const half  slopeh = __float2half(slopef);
-
-    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
-    constexpr int nwarps = D / WARP_SIZE;
-    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    __builtin_assume(tid < D);
-
-    __shared__ half KQ[ncols*D];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        KQ[j*D + tid] = -HALF_MAX_HALF;
-    }
-    half2 * KQ2 = (half2 *) KQ;
-
-    half kqmax[ncols];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        kqmax[j] = -HALF_MAX_HALF;
-    }
-    half kqsum[ncols] = {0.0f};
-
-    __shared__ half kqmax_shared[ncols][WARP_SIZE];
-    __shared__ half kqsum_shared[ncols][WARP_SIZE];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        if (threadIdx.y == 0) {
-            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
-            kqsum_shared[j][threadIdx.x] = 0.0f;
-        }
-    }
-    __syncthreads();
-
-    // Convert Q to half2 and store in registers:
-    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-#pragma unroll
-        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
-            const int i = i0 + threadIdx.x;
-
-            const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
-            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
-        }
-    }
-
-    half2 VKQ[ncols] = {{0.0f, 0.0f}};
-
-    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
-        // Calculate KQ tile and keep track of new maximum KQ values:
-
-        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
-        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
-        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
-        half kqmax_new = kqmax[0];
-        half kqmax_new_arr[ncols];
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            kqmax_new_arr[j] = kqmax[j];
-        }
-
-#pragma unroll
-        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
-            const int i_KQ = i_KQ_0 + threadIdx.y;
-
-            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
-                break;
-            }
-
-            half2 sum2[ncols] = {{0.0f, 0.0f}};
-#pragma unroll
-            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
-                const int k_KQ = k_KQ_0 + threadIdx.x;
-
-                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
-#pragma unroll
-                for (int j = 0; j < ncols; ++j) {
-                    sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
-                }
-            }
-
-#pragma unroll
-            for (int j = 0; j < ncols; ++j) {
-                sum2[j] = warp_reduce_sum(sum2[j]);
-                half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
-                sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
-
-                if (ncols == 1) {
-                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
-                } else {
-                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
-                }
-
-                if (threadIdx.x == 0) {
-                    KQ[j*D + i_KQ] = sum;
-                }
-            }
-        }
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
-
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
-            if (threadIdx.x == 0) {
-                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
-            }
-        }
-
-        __syncthreads();
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
-
-            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
-            kqmax[j] = kqmax_new_j;
-
-            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
-            kqsum[j] = kqsum[j]*KQ_max_scale + val;
-            KQ[j*D + tid] = val;
-
-            VKQ[j] *= __half2half2(KQ_max_scale);
-        }
-
-        __syncthreads();
-
-#pragma unroll
-        for (int k0 = 0; k0 < D; k0 += 2) {
-            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
-                break;
-            }
-
-            half2 V_k;
-            reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
-            reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
-#pragma unroll
-            for (int j = 0; j < ncols; ++j) {
-                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
-            }
-        }
-
-        __syncthreads();
-    }
-
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        kqsum[j] = warp_reduce_sum(kqsum[j]);
-        if (threadIdx.x == 0) {
-            kqsum_shared[j][threadIdx.y] = kqsum[j];
-        }
-    }
-
-    __syncthreads();
-
-#pragma unroll
-    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
-        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
-        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
-
-        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
-        if (parallel_blocks == 1) {
-            dst_val /= kqsum[j_VKQ];
-        }
-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
-    }
-
-    if (parallel_blocks != 1 && tid < ncols) {
-        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
-    }
-#else
-   NO_DEVICE_CODE;
-#endif // FP16_AVAILABLE
-}
-
-void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * KQV = dst;
-    ggml_tensor * Q   = dst->src[0];
-
-    const int32_t precision = KQV->op_params[2];
-    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
-
-    constexpr int cols_per_block  = 1;
-    constexpr int parallel_blocks = 4;
-    switch (Q->ne[0]) {
-        case  64: {
-            constexpr int      D = 64;
-            constexpr int nwarps = D/WARP_SIZE;
-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
-        } break;
-        case 128: {
-            constexpr int      D = 128;
-            constexpr int nwarps = D/WARP_SIZE;
-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
-        } break;
-        case 256: {
-            constexpr int      D = 256;
-            constexpr int nwarps = D/WARP_SIZE;
-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
-        } break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-}
-
-template <int cols_per_block, int parallel_blocks>
-void launch_fattn_vec_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * Q = dst->src[0];
-    switch (Q->ne[0]) {
-        case  64: {
-            constexpr int      D = 64;
-            constexpr int nwarps = D/WARP_SIZE;
-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
-        } break;
-        case 128: {
-            constexpr int      D = 128;
-            constexpr int nwarps = D/WARP_SIZE;
-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
-        } break;
-        default: {
-            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
-        } break;
-    }
-}
-
-void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * KQV = dst;
-    const ggml_tensor * Q   = dst->src[0];
-
-    const int32_t precision = KQV->op_params[2];
-    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
-
-    if (Q->ne[1] == 1) {
-        ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
-        return;
-    }
-
-    if (Q->ne[1] == 2) {
-        constexpr int cols_per_block  = 2;
-        constexpr int parallel_blocks = 4;
-        launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
-        return;
-    }
-
-    if (Q->ne[1] <= 4) {
-        constexpr int cols_per_block  = 4;
-        constexpr int parallel_blocks = 4;
-        launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
-        return;
-    }
-
-    if (Q->ne[1] <= 8) {
-        constexpr int cols_per_block  = 8;
-        constexpr int parallel_blocks = 4;
-        launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
-        return;
-    }
-
-    constexpr int cols_per_block  = 8;
-    constexpr int parallel_blocks = 1;
-    launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
-}
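
The f16 kernel removed above is built around a streaming ("online") softmax: for each output column it keeps a running maximum kqmax, a running denominator kqsum, and a running weighted sum VKQ, and whenever a new K tile raises the maximum it rescales the old accumulators by exp(old_max - new_max) before adding the new terms. A scalar sketch of just that update rule (plain C++, no CUDA specifics; variable names are illustrative):

// Scalar sketch of the online-softmax accumulation used by the
// flash_attn_vec_* kernels: process scores one at a time while keeping
// softmax numerically stable without a second pass.
#include <cmath>
#include <vector>

// scores: one column of KQ values; values: matching V entries (1-D here).
// Returns sum_i softmax(scores)_i * values_i in a single streaming pass.
float streaming_softmax_dot(const std::vector<float> & scores,
                            const std::vector<float> & values) {
    float running_max = -INFINITY;
    float running_sum = 0.0f;   // denominator, relative to running_max
    float running_vkq = 0.0f;   // numerator,   relative to running_max

    for (std::size_t i = 0; i < scores.size(); ++i) {
        const float new_max = std::fmax(running_max, scores[i]);
        const float scale   = std::exp(running_max - new_max); // <= 1, rescales old terms
        const float p       = std::exp(scores[i] - new_max);

        running_sum = running_sum * scale + p;
        running_vkq = running_vkq * scale + p * values[i];
        running_max = new_max;
    }
    return running_vkq / running_sum;   // the kernel divides at the very end
}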

+ 0 - 275
llama/ggml-cuda/fattn-vec-f32.cu

@@ -1,275 +0,0 @@
-#include "common.cuh"
-#include "fattn-common.cuh"
-#include "fattn-vec-f32.cuh"
-
-template<int D, int ncols, int parallel_blocks> // D == head size
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-__launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-static __global__ void flash_attn_vec_ext_f32(
-        const char * __restrict__ Q,
-        const char * __restrict__ K,
-        const char * __restrict__ V,
-        const char * __restrict__ mask,
-        float      * __restrict__ dst,
-        float2     * __restrict__ dst_meta,
-        const float scale,
-        const float max_bias,
-        const float m0,
-        const float m1,
-        const uint32_t n_head_log2,
-        const int ne00,
-        const int ne01,
-        const int ne02,
-        const int ne03,
-        const int ne10,
-        const int ne11,
-        const int ne12,
-        const int ne13,
-        const int ne31,
-        const int nb31,
-        const int nb01,
-        const int nb02,
-        const int nb03,
-        const int nb11,
-        const int nb12,
-        const int nb13,
-        const int ne0,
-        const int ne1,
-        const int ne2,
-        const int ne3) {
-    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
-
-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
-
-    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
-    const half   * V_h   = (const half   *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *)  mask + ne11*ic0;
-
-    const int stride_KV  = nb11 / sizeof(half);
-    const int stride_KV2 = nb11 / sizeof(half2);
-
-    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
-
-    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
-    constexpr int nwarps = D / WARP_SIZE;
-    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    __builtin_assume(tid < D);
-
-    __shared__ float KQ[ncols*D];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        KQ[j*D + tid] = -FLT_MAX/2.0f;
-    }
-
-    float kqmax[ncols];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        kqmax[j] = -FLT_MAX/2.0f;
-    }
-    float kqsum[ncols] = {0.0f};
-
-    __shared__ float kqmax_shared[ncols][WARP_SIZE];
-    __shared__ float kqsum_shared[ncols][WARP_SIZE];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        if (threadIdx.y == 0) {
-            kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
-            kqsum_shared[j][threadIdx.x] = 0.0f;
-        }
-    }
-    __syncthreads();
-
-    // Convert Q to half2 and store in registers:
-    float2 Q_h2[ncols][D/(2*WARP_SIZE)];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-#pragma unroll
-        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
-            const int i = i0 + threadIdx.x;
-
-            Q_h2[j][i0/WARP_SIZE]    = Q_f2[j*(nb01/sizeof(float2)) + i];
-            Q_h2[j][i0/WARP_SIZE].x *= scale;
-            Q_h2[j][i0/WARP_SIZE].y *= scale;
-        }
-    }
-
-    float VKQ[ncols] = {0.0f};
-
-    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
-        // Calculate KQ tile and keep track of new maximum KQ values:
-
-        float kqmax_new_arr[ncols];
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            kqmax_new_arr[j] = kqmax[j];
-        }
-
-#pragma unroll
-        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
-            const int i_KQ = i_KQ_0 + threadIdx.y;
-
-            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
-                break;
-            }
-
-            float sum[ncols] = {0.0f};
-#pragma unroll
-            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
-                const int k_KQ = k_KQ_0 + threadIdx.x;
-
-                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
-#pragma unroll
-                for (int j = 0; j < ncols; ++j) {
-                    sum[j] +=  __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x;
-                    sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y;
-                }
-            }
-
-#pragma unroll
-            for (int j = 0; j < ncols; ++j) {
-                sum[j] = warp_reduce_sum(sum[j]);
-                sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
-
-                kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]);
-
-                if (threadIdx.x == 0) {
-                    KQ[j*D + i_KQ] = sum[j];
-                }
-            }
-        }
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            float kqmax_new_j = kqmax_new_arr[j];
-
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
-            if (threadIdx.x == 0) {
-                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
-            }
-        }
-
-        __syncthreads();
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            float kqmax_new_j = kqmax_shared[j][threadIdx.x];
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
-
-            const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
-            kqmax[j] = kqmax_new_j;
-
-            const float val = expf(KQ[j*D + tid] - kqmax[j]);
-            kqsum[j] = kqsum[j]*KQ_max_scale + val;
-            KQ[j*D + tid] = val;
-
-            VKQ[j] *= KQ_max_scale;
-        }
-
-        __syncthreads();
-
-#pragma unroll
-        for (int k = 0; k < D; ++k) {
-            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
-                break;
-            }
-
-            const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]);
-#pragma unroll
-            for (int j = 0; j < ncols; ++j) {
-                VKQ[j] += V_ki*KQ[j*D + k];
-            }
-        }
-
-        __syncthreads();
-    }
-
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        kqsum[j] = warp_reduce_sum(kqsum[j]);
-        if (threadIdx.x == 0) {
-            kqsum_shared[j][threadIdx.y] = kqsum[j];
-        }
-    }
-
-    __syncthreads();
-
-#pragma unroll
-    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
-        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
-        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
-
-        float dst_val = VKQ[j_VKQ];
-        if (parallel_blocks == 1) {
-            dst_val /= kqsum[j_VKQ];
-        }
-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
-    }
-
-    if (parallel_blocks != 1 && tid < ncols) {
-        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
-    }
-}
-
-template <int cols_per_block, int parallel_blocks>
-void launch_fattn_vec_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * Q = dst->src[0];
-    switch (Q->ne[0]) {
-        case  64: {
-            constexpr int      D = 64;
-            constexpr int nwarps = D/WARP_SIZE;
-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
-        } break;
-        case 128: {
-            constexpr int      D = 128;
-            constexpr int nwarps = D/WARP_SIZE;
-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
-        } break;
-        default: {
-            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
-        } break;
-    }
-}
-
-void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * Q = dst->src[0];
-
-    if (Q->ne[1] == 1) {
-        constexpr int cols_per_block  = 1;
-        constexpr int parallel_blocks = 4;
-        launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
-        return;
-    }
-
-    if (Q->ne[1] == 2) {
-        constexpr int cols_per_block  = 2;
-        constexpr int parallel_blocks = 4;
-        launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
-        return;
-    }
-
-    if (Q->ne[1] <= 4) {
-        constexpr int cols_per_block  = 4;
-        constexpr int parallel_blocks = 4;
-        launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
-        return;
-    }
-
-    if (Q->ne[1] <= 8) {
-        constexpr int cols_per_block  = 8;
-        constexpr int parallel_blocks = 4;
-        launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
-        return;
-    }
-
-    constexpr int cols_per_block  = 8;
-    constexpr int parallel_blocks = 1;
-    launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
-}
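
The f32 variant follows the same structure in single precision. With parallel_blocks > 1 both kernels skip the final division and instead store the unnormalized partial result together with a (kqmax, kqsum) pair in dst_meta, leaving the combination to a separate pass (presumably the helpers in fattn-common.cuh, which both files include). A sketch of how two such partials over disjoint K/V ranges can be merged (plain C++; an illustration of the math, not the backend's actual combine kernel):

// Sketch: merging two partial flash-attention results computed over
// disjoint K/V ranges. Each partial carries (max, sum, unnormalized vkq).
#include <cmath>

struct Partial {
    float max;   // running softmax maximum of this block's range
    float sum;   // softmax denominator, relative to max
    float vkq;   // unnormalized weighted V sum, relative to max
};

float combine(const Partial & a, const Partial & b) {
    const float m  = std::fmax(a.max, b.max);
    const float sa = std::exp(a.max - m);   // rescale factors to the common max
    const float sb = std::exp(b.max - m);
    const float sum = a.sum * sa + b.sum * sb;
    const float vkq = a.vkq * sa + b.vkq * sb;
    return vkq / sum;                       // final, normalized output element
}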

+ 178 - 178
llama/ggml-cuda/getrows.cu

@@ -1,178 +1,178 @@
-#include "getrows.cuh"
-#include "dequantize.cuh"
-
-template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void k_get_rows(
-            const void * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-
-    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
-
-    const int ib = i00/qk; // block index
-    const int iqs = (i00%qk)/qr; // quant index
-    const int iybs = i00 - i00%qk; // dst block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(src0_row, ib, iqs, v);
-
-    dst_row[iybs + iqs + 0]        = v.x;
-    dst_row[iybs + iqs + y_offset] = v.y;
-}
-
-template<typename src0_t, typename dst_t>
-static __global__ void k_get_rows_float(
-            const src0_t * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
-
-    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
-
-    dst_row[i00] = src0_row[i00];
-}
-
-template<int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    GGML_ASSERT(ne00 % 2 == 0);
-
-    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
-            ne00, /*ne01, ne02, ne03,*/
-            /*ne10, ne11,*/ ne12, /*ne13,*/
-            /* s0,*/ s1, s2, s3,
-            /* nb00,*/ nb01, nb02, nb03,
-            s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
-}
-
-template<typename src0_t>
-static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
-            src0_dd, src1_dd, dst_dd,
-            ne00, /*ne01, ne02, ne03,*/
-            /*ne10, ne11,*/ ne12, /*ne13,*/
-            /* s0,*/ s1, s2, s3,
-            /* nb00,*/ nb01, nb02, nb03,
-            s10, s11, s12/*, s13*/);
-
-    GGML_UNUSED(dst);
-}
-
-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        default:
-            // TODO: k-quants
-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ASSERT(false);
-            break;
-    }
-}
+#include "getrows.cuh"
+#include "dequantize.cuh"
+
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(
+            const void * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0]        = v.x;
+    dst_row[iybs + iqs + y_offset] = v.y;
+}
+
+template<typename src0_t, typename dst_t>
+static __global__ void k_get_rows_float(
+            const src0_t * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
+
+    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
+}
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
+            src0_dd, src1_dd, dst_dd,
+            ne00, /*ne01, ne02, ne03,*/
+            /*ne10, ne11,*/ ne12, /*ne13,*/
+            /* s0,*/ s1, s2, s3,
+            /* nb00,*/ nb01, nb02, nb03,
+            s10, s11, s12/*, s13*/);
+
+    GGML_UNUSED(dst);
+}
+
+template<typename src0_t>
+static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
+            src0_dd, src1_dd, dst_dd,
+            ne00, /*ne01, ne02, ne03,*/
+            /*ne10, ne11,*/ ne12, /*ne13,*/
+            /* s0,*/ s1, s2, s3,
+            /* nb00,*/ nb01, nb02, nb03,
+            s10, s11, s12/*, s13*/);
+
+    GGML_UNUSED(dst);
+}
+
+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src0_d = (const float *)src0->data;
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        default:
+            // TODO: k-quants
+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_ASSERT(false);
+            break;
+    }
+}
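
k_get_rows and k_get_rows_float gather whole rows of src0 using the int32 indices in src1; the quantized path expands two values per thread, which is why i00 advances in steps of 2 and the launch divides ne00 by 2*CUDA_GET_ROWS_BLOCK_SIZE. A minimal CPU reference for the float path (contiguous row-major data only, so every stride collapses to ne00):

// CPU reference for the float path of get_rows: dst[r, :] = src0[idx[r], :].
// Assumes contiguous row-major data, i.e. the simple case of the kernel above.
#include <cstdint>
#include <vector>

std::vector<float> get_rows_ref(const std::vector<float> & src0, int64_t ne00,
                                const std::vector<int32_t> & idx) {
    std::vector<float> dst(idx.size() * ne00);
    for (int64_t r = 0; r < (int64_t) idx.size(); ++r) {
        const float * src_row = src0.data() + int64_t(idx[r]) * ne00;
        for (int64_t c = 0; c < ne00; ++c) {
            dst[r * ne00 + c] = src_row[c];   // k_get_rows_float copies one element per thread
        }
    }
    return dst;
}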

+ 5 - 5
llama/ggml-cuda/getrows.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_GET_ROWS_BLOCK_SIZE 256
-
-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
+
+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 104 - 104
llama/ggml-cuda/im2col.cu

@@ -1,104 +1,104 @@
-#include "im2col.cuh"
-
-template <typename T>
-static  __global__ void im2col_kernel(
-        const float * x, T * dst, int64_t batch_offset,
-        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
-        int s0, int s1, int p0, int p1, int d0, int d1) {
-    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
-    if (i >= pelements) {
-        return;
-    }
-
-    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
-    const int64_t  kx = i / ksize;
-    const int64_t  kd = kx * ksize;
-    const int64_t  ky = (i - kd) / OW;
-    const int64_t  ix = i % OW;
-
-    const int64_t  oh = blockIdx.y;
-    const int64_t  batch = blockIdx.z / IC;
-    const int64_t  ic = blockIdx.z % IC;
-
-    const int64_t iiw = ix * s0 + kx * d0 - p0;
-    const int64_t iih = oh * s1 + ky * d1 - p1;
-
-    const int64_t offset_dst =
-        ((batch * OH + oh) * OW + ix) * CHW +
-        (ic * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] = 0.0f;
-    } else {
-        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
-        dst[offset_dst] = x[offset_src + iih * IW + iiw];
-    }
-}
-
-template <typename T>
-static void im2col_cuda(const float * x, T* dst,
-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-    const int parallel_elements = OW * KW * KH;
-    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    dim3 block_nums(num_blocks, OH, batch * IC);
-    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
-}
-
-static void im2col_cuda_f16(const float * x, half * dst,
-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-
-    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
-}
-
-static void im2col_cuda_f32(const float * x, float * dst,
-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
-
-    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
-}
-
-void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const ggml_tensor * src1 = dst->src[1];
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
-    const int64_t IC = src1->ne[is_2D ? 2 : 1];
-    const int64_t IH = is_2D ? src1->ne[1] : 1;
-    const int64_t IW =         src1->ne[0];
-
-    const int64_t KH = is_2D ? src0->ne[1] : 1;
-    const int64_t KW =         src0->ne[0];
-
-    const int64_t OH = is_2D ? dst->ne[2] : 1;
-    const int64_t OW =         dst->ne[1];
-
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-    const int64_t batch = src1->ne[3];
-    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
-
-    if(dst->type == GGML_TYPE_F16) {
-        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
-    } else {
-        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
-    }
-}
+#include "im2col.cuh"
+
+template <typename T>
+static  __global__ void im2col_kernel(
+        const float * x, T * dst, int64_t batch_offset,
+        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
+        int s0, int s1, int p0, int p1, int d0, int d1) {
+    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
+    if (i >= pelements) {
+        return;
+    }
+
+    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
+    const int64_t  kx = i / ksize;
+    const int64_t  kd = kx * ksize;
+    const int64_t  ky = (i - kd) / OW;
+    const int64_t  ix = i % OW;
+
+    const int64_t  oh = blockIdx.y;
+    const int64_t  batch = blockIdx.z / IC;
+    const int64_t  ic = blockIdx.z % IC;
+
+    const int64_t iiw = ix * s0 + kx * d0 - p0;
+    const int64_t iih = oh * s1 + ky * d1 - p1;
+
+    const int64_t offset_dst =
+        ((batch * OH + oh) * OW + ix) * CHW +
+        (ic * (KW * KH) + ky * KW + kx);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = 0.0f;
+    } else {
+        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
+        dst[offset_dst] = x[offset_src + iih * IW + iiw];
+    }
+}
+
+template <typename T>
+static void im2col_cuda(const float * x, T* dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    dim3 block_nums(num_blocks, OH, batch * IC);
+    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
+}
+
+static void im2col_cuda_f16(const float * x, half * dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+
+    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+}
+
+static void im2col_cuda_f32(const float * x, float * dst,
+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
+    int64_t batch, int64_t batch_offset, int64_t offset_delta,
+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
+
+    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
+}
+
+void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const float * src1_d = (const float *)src1->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW =         src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW =         src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW =         dst->ne[1];
+
+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+    const int64_t batch = src1->ne[3];
+    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
+
+    if(dst->type == GGML_TYPE_F16) {
+        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+    } else {
+        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
+    }
+}
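
im2col_kernel unrolls each KW x KH patch of the input into one row of the destination so that the following convolution can run as a matrix multiply; offset_dst places patch element (ic, ky, kx) for output pixel (oh, ix) at ((batch*OH + oh)*OW + ix)*CHW + ic*KW*KH + ky*KW + kx, and positions that fall outside the padded input are written as zero. A single-batch CPU reference with the same destination layout (float output only, contiguous input assumed):

// CPU reference for 2-D im2col with the same destination layout as
// im2col_kernel above (single batch, float output).
#include <cstdint>
#include <vector>

std::vector<float> im2col_ref(const std::vector<float> & x,   // [IC, IH, IW]
                              int64_t IC, int64_t IH, int64_t IW,
                              int64_t KH, int64_t KW,
                              int64_t OH, int64_t OW,
                              int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t CHW = IC * KH * KW;
    std::vector<float> dst(OH * OW * CHW, 0.0f);
    for (int64_t oh = 0; oh < OH; ++oh)
    for (int64_t ow = 0; ow < OW; ++ow)
    for (int64_t ic = 0; ic < IC; ++ic)
    for (int64_t ky = 0; ky < KH; ++ky)
    for (int64_t kx = 0; kx < KW; ++kx) {
        const int64_t iih = oh * s1 + ky * d1 - p1;   // input row, as in the kernel
        const int64_t iiw = ow * s0 + kx * d0 - p0;   // input column
        const int64_t dst_i = (oh * OW + ow) * CHW + ic * (KW * KH) + ky * KW + kx;
        if (iih >= 0 && iih < IH && iiw >= 0 && iiw < IW) {
            dst[dst_i] = x[(ic * IH + iih) * IW + iiw];
        }                                             // else: stays zero (padding)
    }
    return dst;
}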

+ 5 - 5
llama/ggml-cuda/im2col.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_IM2COL_BLOCK_SIZE 256
-
-void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_IM2COL_BLOCK_SIZE 256
+
+void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 2 - 1
llama/ggml-cuda/mmq.cu

@@ -11,6 +11,7 @@ void ggml_cuda_op_mul_mat_q(
     const int64_t nb01 = src0->nb[1];
 
     const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
     GGML_ASSERT(ne10 % QK8_1 == 0);
 
     const int64_t ne0 = dst->ne[0];
@@ -25,7 +26,7 @@ void ggml_cuda_op_mul_mat_q(
     // nrows_dst == nrows of the matrix that the kernel writes into
     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
 
-    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst};
+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:

+ 129 - 107
llama/ggml-cuda/mmq.cuh

@@ -1,15 +1,26 @@
+#pragma once
+
 #include "common.cuh"
 #include "vecdotq.cuh"
 
 #include <climits>
 #include <cstdint>
 
+#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)
+
 typedef void (*load_tiles_mmq_t)(
     const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride);
 typedef void (*vec_dot_mmq_t)(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, float * __restrict__ sum, const int & k0);
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0);
+
+struct block_q8_1_mmq {
+    half2  ds[4];
+    int8_t qs[4*QK8_1];
+};
+static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
+static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1),      "Unexpected block_q8_1_mmq size");
 
 struct tile_x_sizes {
     int ql;
@@ -132,10 +143,14 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
 
     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 
+    const float * x_dmf = (const float *) x_dm;
+    const int   * y_qs  = (const int   *) y + 4;
+    const half2 * y_ds  = (const half2 *) y;
+
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -145,19 +160,18 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
             const int i = i0 + threadIdx.x;
 
             const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
-            const float * x_dmf = (const float *) x_dm;
 
             int u[2*VDR_Q4_0_Q8_1_MMQ];
 
 #pragma unroll
             for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-                u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-                u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
+                u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l)         % WARP_SIZE];
+                u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_0) % WARP_SIZE];
             }
 
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
-                (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
-                y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
+                (&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dmf[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
+                y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
         }
     }
 }
@@ -203,10 +217,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
 
     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -221,13 +238,13 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
 
 #pragma unroll
             for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-                u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-                u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
+                u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l)         % WARP_SIZE];
+                u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_1) % WARP_SIZE];
             }
 
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
-                (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1],
-                y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
+                (&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1],
+                y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
         }
     }
 }
@@ -293,10 +310,14 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
 
     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 
+    const float * x_dmf = (const float *) x_dm;
+    const int   * y_qs  = (const int   *) y + 4;
+    const float * y_df  = (const float *) y;
+
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -306,20 +327,18 @@ static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
             const int i = i0 + threadIdx.x;
 
             const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
-            const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0;
-            const float * x_dmf = (const float *) x_dm;
-            const float * y_df  = (const float *) y_ds;
+            const int index_bx = i*(WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0;
 
             int u[2*VDR_Q5_0_Q8_1_MMQ];
 
 #pragma unroll
             for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-                u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-                u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
+                u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l)         % WARP_SIZE];
+                u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_0) % WARP_SIZE];
             }
 
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
-                (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
+                (&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dmf[index_bx], y_df[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
         }
     }
 }
@@ -383,10 +402,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
 
     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 
+    const int   * y_qs  = (const int   *) y + 4;
+    const half2 * y_ds  = (const half2 *) y;
+
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -396,18 +418,18 @@ static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
             const int i = i0 + threadIdx.x;
 
             const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
-            const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k0/QI5_1;
+            const int index_bx = i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI5_1;
 
             int u[2*VDR_Q5_1_Q8_1_MMQ];
 
 #pragma unroll
             for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-                u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-                u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
+                u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l)         % WARP_SIZE];
+                u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_1) % WARP_SIZE];
             }
 
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-                (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
+                (&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dm[index_bx], y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
         }
     }
 }
@@ -455,10 +477,14 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
 
     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 
+    const float * x_dmf = (const float *) x_dm;
+    const int   * y_qs  = (const int   *) y + 4;
+    const float * y_df  = (const float *) y;
+
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -467,12 +493,9 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
         for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
             const int i = i0 + threadIdx.x;
 
-            const float * x_dmf = (const float *) x_dm;
-            const float * y_df  = (const float *) y_ds;
-
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
-                (&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[j * WARP_SIZE + k0], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0],
-                y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1]);
+                (&x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0], x_dmf[i*(WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0],
+                y_df[j*MMQ_TILE_Y_K + k0/QI8_1]);
         }
     }
 }
@@ -531,10 +554,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
 
     GGML_UNUSED(x_qh);
 
+    const int   * y_qs  = (const int   *) y + 4;
+    const float * y_df  = (const float *) y;
+
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -545,11 +571,10 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
 
             const int kbx = k0 / QI2_K;
             const int ky  = (k0 % QI2_K) * QR2_K;
-            const float * y_df = (const float *) y_ds;
 
             int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
 
-            const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+            const int kqsx = i*(WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
             const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
 
 #pragma unroll
@@ -557,11 +582,11 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
                 v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
             }
 
-            const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
+            const uint8_t * scales = ((const uint8_t *) &x_sc[i*(WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
 
-            const int index_y = j * WARP_SIZE + (QR2_K*k0) % WARP_SIZE;
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq(
-                v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
+                v, &y_qs[j*MMQ_TILE_Y_K + (QR2_K*k0) % WARP_SIZE], scales,
+                x_dm[i*(WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[j*MMQ_TILE_Y_K + ((QR2_K*k0) % WARP_SIZE)/QI8_1]);
         }
     }
 }
@@ -646,7 +671,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
+
+    const float * x_dmf = (const float *) x_dm;
+    const int   * y_qs  = (const int   *) y + 4;
+    const float * y_df  = (const float *) y;
 
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -658,8 +687,6 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
 
             const int kbx  = k0 / QI3_K;
             const int ky  = (k0 % QI3_K) * QR3_K;
-            const float * x_dmf = (const float *) x_dm;
-            const float * y_df  = (const float *) y_ds;
 
             const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
 
@@ -667,19 +694,19 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
 
 #pragma unroll
             for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
-                const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
+                const int kqsx = i*(WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
                 const int shift = 2 * ((ky % 32) / 8);
                 const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
 
-                const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
+                const int vh = x_qh[i*(WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
                 const int vlh = (vh << 2) & 0x04040404;
 
                 v[l] = __vsubss4(vll, vlh);
             }
 
-            const int index_y = j * WARP_SIZE + (k0*QR3_K) % WARP_SIZE;
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq(
-                v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
+                v, &y_qs[j*MMQ_TILE_Y_K + (k0*QR3_K) % WARP_SIZE], scales,
+                x_dmf[i*(WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[j*MMQ_TILE_Y_K + ((k0*QR3_K) % WARP_SIZE)/QI8_1]);
         }
     }
 }
@@ -746,10 +773,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
 
     GGML_UNUSED(x_qh);
 
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -760,9 +790,9 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
 
             const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8);
 
-            const int index_y = j * WARP_SIZE + (QR4_K*k0) % WARP_SIZE;
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq(
-                &x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+                &x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + (QR4_K*k0) % WARP_SIZE], sc, sc+8,
+                x_dm[i*(WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[j*MMQ_TILE_Y_K + ((QR4_K*k0) % WARP_SIZE)/QI8_1]);
         }
     }
 }
@@ -842,10 +872,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
 
     GGML_UNUSED(x_qh);
 
+    const int   * y_qs  = (const int   *) y + 4;
+    const half2 * y_ds  = (const half2 *) y;
+
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -856,10 +889,9 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
 
             const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
 
-            const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k0;
-            const int index_y = j * WARP_SIZE             + (QR5_K*k0) % WARP_SIZE;
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq(
-                &x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+                &x_ql[i*(QR5_K*WARP_SIZE + 1) + QR5_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR5_K*k0) % WARP_SIZE], sc, sc+8,
+                x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[j*MMQ_TILE_Y_K + ((QR5_K*k0) % WARP_SIZE)/QI8_1]);
         }
     }
 }
@@ -932,10 +964,14 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 template <int mmq_x, int mmq_y, int nwarps>
 static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
 
     GGML_UNUSED(x_qh);
 
+    const float * x_dmf = (const float *) x_dm;
+    const int   * y_qs  = (const int   *) y + 4;
+    const float * y_df  = (const float *) y;
+
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
@@ -944,15 +980,11 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
         for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
             const int i = i0 + threadIdx.x;
 
-            const float * x_dmf = (const float *) x_dm;
-            const float * y_df  = (const float *) y_ds;
-
             const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]);
 
-            const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k0;
-            const int index_y = j * WARP_SIZE             + (QR6_K*k0) % WARP_SIZE;
             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq(
-                &x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
+                &x_ql[i*(QR6_K*WARP_SIZE + 1) + QR6_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR6_K*k0) % WARP_SIZE], sc,
+                x_dmf[i*(WARP_SIZE/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + ((QR6_K*k0) % WARP_SIZE)/QI8_1]);
         }
     }
 }
@@ -964,7 +996,6 @@ struct mmq_type_traits;
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
-    static constexpr bool             need_sum   = true;
     static constexpr int              vdr        = VDR_Q4_0_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q4_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -972,7 +1003,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
-    static constexpr bool             need_sum   = true;
     static constexpr int              vdr        = VDR_Q4_1_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q4_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -980,7 +1010,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
-    static constexpr bool             need_sum   = false;
     static constexpr int              vdr        = VDR_Q5_0_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q5_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -988,7 +1017,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
-    static constexpr bool             need_sum   = true;
     static constexpr int              vdr        = VDR_Q5_1_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q5_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -996,7 +1024,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
-    static constexpr bool             need_sum   = false;
     static constexpr int              vdr        = VDR_Q8_0_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q8_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1004,7 +1031,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
-    static constexpr bool             need_sum   = false;
     static constexpr int              vdr        = VDR_Q2_K_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q2_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1012,7 +1038,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
-    static constexpr bool             need_sum   = false;
     static constexpr int              vdr        = VDR_Q3_K_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q3_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1020,7 +1045,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
-    static constexpr bool             need_sum   = true;
     static constexpr int              vdr        = VDR_Q4_K_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q4_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1028,7 +1052,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
-    static constexpr bool             need_sum   = true;
     static constexpr int              vdr        = VDR_Q5_K_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q5_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
@@ -1036,12 +1059,36 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
 
 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
-    static constexpr bool             need_sum   = false;
     static constexpr int              vdr        = VDR_Q6_K_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>;
     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q6_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
 };
 
+static int mmq_need_sum(const ggml_type type_x) {
+    switch (type_x) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return true;
+        case GGML_TYPE_Q5_0:
+            return false;
+        case GGML_TYPE_Q5_1:
+            return true;
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+            return false;
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return true;
+        case GGML_TYPE_Q6_K:
+            return false;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+    return false;
+}
+
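
need_sum used to be a compile-time member of mmq_type_traits (note the deletions below); it is now a host-side query so the decision is made once per matmul, when src1 is quantized, rather than inside the kernel. The in-kernel comment removed further down ("if the sum is not needed it's faster to transform the scale to f32 ahead of time") still describes the rationale: formats whose dot product needs the per-block sum of y keep the half2 (d, s) pair, the others store a pre-converted f32 scale. Grounded usage from this same commit (quantize.cu, shown later in the diff):

    // quantize_mmq_q8_1_cuda() picks the kernel template from this query:
    if (mmq_need_sum(type_x)) {
        quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
    } else {
        quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
    }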
 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -1056,7 +1103,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 static __global__ void mul_mat_q(
     const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst,
-    const int ne00, const int ne01, const int stride00, const int ne10, const int ne11, const int ne0) {
+    const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
 
     // Skip unused template specializations for faster compilation:
     if (mmq_x > get_mmq_x_max_device()) {
@@ -1068,7 +1115,6 @@ static __global__ void mul_mat_q(
     constexpr int              qr         = ggml_cuda_type_traits<type>::qr;
     constexpr int              qi         = ggml_cuda_type_traits<type>::qi;
     constexpr int              mmq_y      = get_mmq_y_device(mmq_x);
-    constexpr bool             need_sum   = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::need_sum;
     constexpr int              vdr        = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr;
     constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
     constexpr vec_dot_mmq_t    vec_dot    = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot;
@@ -1080,62 +1126,38 @@ static __global__ void mul_mat_q(
     half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql);
     int   * tile_x_qh = (int   *) (tile_x_dm + txs.dm);
     int   * tile_x_sc = (int   *) (tile_x_qh + txs.qh);
-    int   * tile_y_qs = (int   *) (tile_x_sc + txs.sc);          // [mmq_x * WARP_SIZE]
-    half2 * tile_y_ds = (half2 *) (tile_y_qs + mmq_x*WARP_SIZE); // [mmq_x * WARP_SIZE/QI8_1];
-
-    const block_q8_1 * y = (const block_q8_1 *) yc;
+    int   * tile_y    = (int   *) (tile_x_sc + txs.sc); // [mmq_x * (WARP_SIZE + WARP_SIZE/QI8_1)]
 
     const int blocks_per_row_x = ne00 / qk;
-    const int blocks_per_col_y = ne10 / QK8_1;
     const int blocks_per_warp = WARP_SIZE / qi;
 
     const int & ne1 = ne11;
 
     const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1;
 
+    const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
+
     float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f};
 
     for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) {
 
-        load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride00*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride00);
+        load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride01*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride01);
 
 #pragma unroll
         for (int kr = 0; kr < qr; ++kr) {
-            const int kqs = kr*WARP_SIZE + threadIdx.x;
-            const int kbxd = kqs / QI8_1;
-
+            const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + kr*sizeof(block_q8_1_mmq)/sizeof(int));
 #pragma unroll
-            for (int i0 = 0; i0 < mmq_x; i0 += nwarps) {
-                const int i = min(blockIdx.y*mmq_x + threadIdx.y + i0, ne11-1); // to prevent out-of-bounds memory accesses
-
-                const block_q8_1 * by0 = &y[i*blocks_per_col_y + kb0 * (qk/QK8_1) + kbxd];
+            for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) {
+                int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x;
 
-                const int index_y = (i0 + threadIdx.y) * WARP_SIZE + kqs % WARP_SIZE;
-                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
-            }
-
-#pragma unroll
-            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
-                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
-                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
-                const int i_y_eff = min(blockIdx.y*mmq_x + ids, ne11-1);
-
-                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-                const half2 * dsi_src = &y[i_y_eff*blocks_per_col_y + kb0 * (qk/QK8_1) + kr*(WARP_SIZE/QI8_1) + kby].ds;
-                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
-                if (need_sum) {
-                    *dsi_dst = *dsi_src;
-                } else {
-                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = __low2float(*dsi_src);
-                }
+                tile_y[l] = by0[l];
             }
 
             __syncthreads();
 
 // #pragma unroll // unrolling this loop causes too much register pressure
             for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) {
-                vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, sum, k0);
+                vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y, sum, k0);
             }
 
             __syncthreads();
@@ -1165,8 +1187,8 @@ static __global__ void mul_mat_q(
 
 struct mmq_args {
     const char * x; const char * y; float * dst;
-    int64_t ne00; int64_t ne01; int64_t stride00;
-    int64_t ne10; int64_t ne11;
+    int64_t ne00; int64_t ne01; int64_t stride01;
+    int64_t ne10; int64_t ne11; int64_t stride11;
     int64_t ne0;
 };
 
@@ -1184,7 +1206,7 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
     const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y);
     const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int);
     const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2);
-    const int shmem = shmem_x + shmem_y;
+    const int shmem = shmem_x + GGML_PAD(shmem_y, nwarps*WARP_SIZE*sizeof(int));
 
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
     static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
@@ -1198,11 +1220,11 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
     if (args.ne01 % mmq_y == 0) {
         const bool need_check = false;
         mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
-            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
+            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
     } else {
         const bool need_check = true;
         mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
-            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
+            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
     }
 }
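
On the GGML_PAD change just above: shmem_y as computed equals mmq_x*MMQ_TILE_Y_K ints (the half2 term is the 4-int ds header per row), while the new tile-copy loop in mul_mat_q writes one int per thread in unguarded chunks of nwarps*WARP_SIZE ints. Rounding the y part of the allocation up to a whole number of chunks keeps the last partial iteration inside shared memory; the quantized y buffer is assumed to be padded correspondingly on the global-memory side. Illustrative numbers (assuming WARP_SIZE = 32, QI8_1 = 8, nwarps = 8, mmq_x = 8):

    // shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2)
    //         = mmq_x * MMQ_TILE_Y_K * sizeof(int)        // 8 * 36 * 4 = 1152 bytes
    // copy-loop chunk = nwarps*WARP_SIZE*sizeof(int)      // 8 * 32 * 4 = 1024 bytes
    // GGML_PAD(1152, 1024)                                //           = 2048 bytes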
 

+ 7 - 7
llama/ggml-cuda/mmvq.cuh

@@ -1,7 +1,7 @@
-#include "common.cuh"
-
-void ggml_cuda_op_mul_mat_vec_q(
-    ggml_backend_cuda_context & ctx,
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, cudaStream_t stream);
+#include "common.cuh"
+
+void ggml_cuda_op_mul_mat_vec_q(
+    ggml_backend_cuda_context & ctx,
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+    const int64_t src1_padded_row_size, cudaStream_t stream);

+ 7 - 7
llama/ggml-cuda/norm.cuh

@@ -1,7 +1,7 @@
-#include "common.cuh"
-
-void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 49 - 49
llama/ggml-cuda/pad.cu

@@ -1,49 +1,49 @@
-#include "pad.cuh"
-
-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
-    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
-    // blockIdx.y: idx of ne1
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
-        int offset_src =
-            nidx +
-            blockIdx.y * ne00 +
-            blockIdx.z * ne00 * ne01;
-        dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
-
-static void pad_f32_cuda(const float * x, float * dst,
-    const int ne00, const int ne01, const int ne02, const int ne03,
-    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, ne1, ne2*ne3);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
-}
-
-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    pad_f32_cuda(src0_d, dst_d,
-        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
-        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
-}
+#include "pad.cuh"
+
+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
+    // blockIdx.y: idx of ne1
+    // blockIDx.x: idx of ne0 / BLOCK_SIZE
+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst =
+        nidx +
+        blockIdx.y * ne0 +
+        blockIdx.z * ne0 * gridDim.y;
+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
+        int offset_src =
+            nidx +
+            blockIdx.y * ne00 +
+            blockIdx.z * ne00 * ne01;
+        dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+static void pad_f32_cuda(const float * x, float * dst,
+    const int ne00, const int ne01, const int ne02, const int ne03,
+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    dim3 gridDim(num_blocks, ne1, ne2*ne3);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+}
+
+void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_cuda(src0_d, dst_d,
+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+}
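
For reference, the host-side equivalent of pad_f32 is a plain copy into the low corner of dst with zero fill elsewhere (sketch, same index math as the kernel):

    #include <cstdint>

    // Reference for pad_f32: dst[i0, i1, i2] = x[i0, i1, i2] inside the source
    // extents, 0 outside (i2 runs over the fused ne2*ne3 / ne02*ne03 dimension).
    static void pad_f32_ref(const float * x, float * dst,
                            int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
                            int64_t ne0,  int64_t ne1,  int64_t ne2,  int64_t ne3) {
        for (int64_t i2 = 0; i2 < ne2*ne3; ++i2)
        for (int64_t i1 = 0; i1 < ne1;     ++i1)
        for (int64_t i0 = 0; i0 < ne0;     ++i0) {
            const bool inside = i0 < ne00 && i1 < ne01 && i2 < ne02*ne03;
            dst[(i2*ne1 + i1)*ne0 + i0] = inside ? x[(i2*ne01 + i1)*ne00 + i0] : 0.0f;
        }
    }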

+ 5 - 5
llama/ggml-cuda/pad.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_PAD_BLOCK_SIZE 256
-
-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_PAD_BLOCK_SIZE 256
+
+void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 94 - 94
llama/ggml-cuda/pool2d.cu

@@ -1,94 +1,94 @@
-#include "pool2d.cuh"
-
-template <typename Ti, typename To>
-static  __global__ void pool2d_nchw_kernel(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const Ti* src, To* dst, const enum ggml_op_pool op) {
-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (idx >= parallel_elements) {
-        return;
-    }
-
-    const int I_HW = ih * iw;
-    const int O_HW = oh * ow;
-    const int nc = idx / O_HW;
-    const int cur_oh = idx % O_HW / ow;
-    const int cur_ow = idx % O_HW % ow;
-    const Ti* i_ptr = src + nc * I_HW;
-    To* o_ptr = dst + nc * O_HW;
-    const int start_h = cur_oh * sh - ph;
-    const int bh = max(0, start_h);
-    const int eh = min(ih, start_h + kh);
-    const int start_w = cur_ow * sw - pw;
-    const int bw = max(0, start_w);
-    const int ew = min(iw, start_w + kw);
-    const To scale = 1. / (kh * kw);
-    To res = 0;
-
-    switch (op) {
-        case GGML_OP_POOL_AVG: res = 0; break;
-        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
-        default: assert(false);
-    }
-
-    for (int i = bh; i < eh; i += 1) {
-        for (int j = bw; j < ew; j += 1) {
-#if __CUDA_ARCH__ >= 350
-            Ti cur = __ldg(i_ptr + i * iw + j);
-#else
-            Ti cur = i_ptr[i * iw + j];
-#endif
-            switch (op) {
-                case GGML_OP_POOL_AVG: res += cur * scale; break;
-                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
-                default: assert(false);
-            }
-        }
-    }
-    o_ptr[cur_oh * ow + cur_ow] = res;
-}
-
-static void pool2d_nchw_kernel_f32_f32_cuda(
-        const int ih, const int iw, const int oh, const int ow,
-        const int kh, const int kw, const int sh, const int sw,
-        const int ph, const int pw, const int parallel_elements,
-        const float * src, float * dst, const enum ggml_op_pool op,
-        cudaStream_t stream) {
-
-    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
-    dim3 block_nums(num_blocks);
-    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
-}
-
-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int32_t * opts = (const int32_t *)dst->op_params;
-    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
-    const int k0 = opts[1];
-    const int k1 = opts[2];
-    const int s0 = opts[3];
-    const int s1 = opts[4];
-    const int p0 = opts[5];
-    const int p1 = opts[6];
-
-    const int64_t IH = src0->ne[1];
-    const int64_t IW = src0->ne[0];
-
-    const int64_t N = dst->ne[3];
-    const int64_t OC = dst->ne[2];
-    const int64_t OH = dst->ne[1];
-    const int64_t OW = dst->ne[0];
-
-    const int parallel_elements = N * OC * OH * OW;
-
-    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
-}
+#include "pool2d.cuh"
+
+template <typename Ti, typename To>
+static  __global__ void pool2d_nchw_kernel(
+        const int ih, const int iw, const int oh, const int ow,
+        const int kh, const int kw, const int sh, const int sw,
+        const int ph, const int pw, const int parallel_elements,
+        const Ti* src, To* dst, const enum ggml_op_pool op) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= parallel_elements) {
+        return;
+    }
+
+    const int I_HW = ih * iw;
+    const int O_HW = oh * ow;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / ow;
+    const int cur_ow = idx % O_HW % ow;
+    const Ti* i_ptr = src + nc * I_HW;
+    To* o_ptr = dst + nc * O_HW;
+    const int start_h = cur_oh * sh - ph;
+    const int bh = max(0, start_h);
+    const int eh = min(ih, start_h + kh);
+    const int start_w = cur_ow * sw - pw;
+    const int bw = max(0, start_w);
+    const int ew = min(iw, start_w + kw);
+    const To scale = 1. / (kh * kw);
+    To res = 0;
+
+    switch (op) {
+        case GGML_OP_POOL_AVG: res = 0; break;
+        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+        default: assert(false);
+    }
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+#if __CUDA_ARCH__ >= 350
+            Ti cur = __ldg(i_ptr + i * iw + j);
+#else
+            Ti cur = i_ptr[i * iw + j];
+#endif
+            switch (op) {
+                case GGML_OP_POOL_AVG: res += cur * scale; break;
+                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
+                default: assert(false);
+            }
+        }
+    }
+    o_ptr[cur_oh * ow + cur_ow] = res;
+}
+
+static void pool2d_nchw_kernel_f32_f32_cuda(
+        const int ih, const int iw, const int oh, const int ow,
+        const int kh, const int kw, const int sh, const int sw,
+        const int ph, const int pw, const int parallel_elements,
+        const float * src, float * dst, const enum ggml_op_pool op,
+        cudaStream_t stream) {
+
+    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
+    dim3 block_nums(num_blocks);
+    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
+}
+
+void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *)dst->op_params;
+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    const int64_t IH = src0->ne[1];
+    const int64_t IW = src0->ne[0];
+
+    const int64_t N = dst->ne[3];
+    const int64_t OC = dst->ne[2];
+    const int64_t OH = dst->ne[1];
+    const int64_t OW = dst->ne[0];
+
+    const int parallel_elements = N * OC * OH * OW;
+
+    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
+}
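
A quick worked example of the index decomposition in pool2d_nchw_kernel, since all N*OC*OH*OW outputs are flattened into a single 1-D grid (pure arithmetic, for a hypothetical 2x3 output plane):

    // idx = 7, oh = 2, ow = 3
    // O_HW   = oh*ow            = 6
    // nc     = idx / O_HW       = 1   // which (n, c) plane
    // cur_oh = idx % O_HW / ow  = 0   // output row inside the plane
    // cur_ow = idx % O_HW % ow  = 1   // output column inside the plane
    // window = [bh, eh) x [bw, ew): the kh x kw footprint clipped to the input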

+ 5 - 5
llama/ggml-cuda/pool2d.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_POOL2D_BLOCK_SIZE 256
-
-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_POOL2D_BLOCK_SIZE 256
+
+void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 77 - 10
llama/ggml-cuda/quantize.cu

@@ -1,22 +1,23 @@
 #include "quantize.cuh"
+#include <cstdint>
 
-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
-    const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
+    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
-    if (ix >= kx_padded) {
+    if (ix0 >= kx0_padded) {
         return;
     }
 
-    const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
+    const int64_t ix1 = blockIdx.y;
 
-    const int64_t i_padded = (int64_t)iy*kx_padded + ix;
+    const int64_t i_padded = ix1*kx0_padded + ix0;
 
     block_q8_1 * y = (block_q8_1 *) vy;
 
     const int64_t ib = i_padded / QK8_1; // block index
     const int64_t iqs = i_padded % QK8_1; // quant index
 
-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
+    const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
     float amax = fabsf(xi);
     float sum = xi;
 
@@ -36,10 +37,76 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
-void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
-    const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const dim3 num_blocks(block_num_x, ky, 1);
+template <bool need_sum>
+static __global__ void quantize_mmq_q8_1(
+    const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
+
+    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (ix0 >= kx0_padded) {
+        return;
+    }
+
+    const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
+
+    block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
+
+    const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
+    const int64_t ib  = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y;              // block index in channel
+    const int64_t iqs = ix0 % (4*QK8_1);                                       // quant index in block
+
+    const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
+    float amax = fabsf(xi);
+
+    amax = warp_reduce_max(amax);
+
+    float sum;
+    if (need_sum) {
+        sum = warp_reduce_sum(xi);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs % QK8_1 != 0) {
+        return;
+    }
+
+    if (need_sum) {
+        y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
+    } else {
+        ((float *) y[ib].ds)[iqs/QK8_1] = d;
+    }
+}
+
+void quantize_row_q8_1_cuda(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
+    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
+
+    GGML_ASSERT(kx0_padded % QK8_1 == 0);
+
+    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, kx1*channels, 1);
     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
+
+    GGML_UNUSED(type_x);
 }
 
+void quantize_mmq_q8_1_cuda(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
+    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
+
+    GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
+
+    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const dim3 num_blocks(block_num_x, kx1, channels);
+    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
+    if (mmq_need_sum(type_x)) {
+        quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+    } else {
+        quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
+    }
+}
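
quantize_mmq_q8_1 packs four QK8_1 groups into each block_q8_1_mmq, lays the blocks out so that consecutive k-blocks of one column are kx1 blocks apart (the ib = ib0 + (ix0/(4*QK8_1))*kx1 + blockIdx.y line), and stores the per-group row sum only for the formats that need it. The per-value math is the plain q8_1 rule; a CPU reference for one 32-value group (sketch, the kernel does the max/sum with warp reductions):

    #include <cmath>
    #include <cstdint>

    // Reference for one QK8_1 == 32 value group, matching quantize_mmq_q8_1:
    //   d = amax / 127,  q_i = round(x_i / d)  (all zeros when amax == 0)
    static void quantize_group_q8_1_ref(const float * x, int8_t * qs, float * d, float * s) {
        float amax = 0.0f, sum = 0.0f;
        for (int i = 0; i < 32; ++i) {
            const float a = std::fabs(x[i]);
            if (a > amax) { amax = a; }
            sum += x[i];
        }
        *d = amax / 127.0f;
        *s = sum; // kept as the second half of ds only when mmq_need_sum(type_x)
        for (int i = 0; i < 32; ++i) {
            qs[i] = amax == 0.0f ? 0 : (int8_t) std::round(x[i] / *d);
        }
    }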

+ 16 - 1
llama/ggml-cuda/quantize.cuh

@@ -1,5 +1,20 @@
+#pragma once
+
 #include "common.cuh"
+#include "mmq.cuh"
+
+#include <cstdint>
 
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 
-void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
+typedef void (*quantize_cuda_t)(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
+    const ggml_type type_x, cudaStream_t stream);
+
+void quantize_row_q8_1_cuda(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
+    const ggml_type type_x, cudaStream_t stream);
+
+void quantize_mmq_q8_1_cuda(
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
+    const ggml_type type_x, cudaStream_t stream);
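
The two launchers now share one signature so the caller can switch between them through the quantize_cuda_t pointer; the kx1/channels/type_x parameters are simply ignored by the plain row quantizer (note the GGML_UNUSED(type_x) above). A hedged sketch of the intended call pattern follows; the real selection lives in ggml-cuda.cu, which is not part of this excerpt, and all variable names here are illustrative:

    // pick the quantizer that matches the mul_mat path that will consume src1
    quantize_cuda_t quantize_src1 = use_mul_mat_q ? quantize_mmq_q8_1_cuda
                                                  : quantize_row_q8_1_cuda;
    quantize_src1(src1_f32, src1_q8, /*kx0=*/ne10, /*kx1=*/ne11, /*channels=*/ne12*ne13,
                  /*kx0_padded=*/src1_padded_row_size, src0->type, stream);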

+ 5 - 5
llama/ggml-cuda/rope.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_ROPE_BLOCK_SIZE 256
-
-void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_ROPE_BLOCK_SIZE 256
+
+void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 5 - 5
llama/ggml-cuda/scale.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_SCALE_BLOCK_SIZE 256
-
-void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_SCALE_BLOCK_SIZE 256
+
+void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 5 - 5
llama/ggml-cuda/softmax.cuh

@@ -1,5 +1,5 @@
-#include "common.cuh"
-
-#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
-
-void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
+
+void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 40 - 40
llama/ggml-cuda/sumrows.cu

@@ -1,40 +1,40 @@
-#include "sumrows.cuh"
-
-static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.x;
-    const int col = threadIdx.x;
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum);
-
-    if (col == 0) {
-        dst[row] = sum;
-    }
-}
-
-static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
-    const dim3 block_nums(nrows, 1, 1);
-    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
-}
-
-void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
-
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
-}
+#include "sumrows.cuh"
+
+static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(nrows, 1, 1);
+    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
+void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
+}
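
sum_rows_f32_cuda launches one WARP_SIZE-wide block per row: each lane accumulates a strided partial sum, warp_reduce_sum (defined in common.cuh) folds the 32 partials together, and lane 0 writes the result. For readers without common.cuh at hand, the typical shape of such a reduction (sketch only, not the exact definition used here):

    static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
    #pragma unroll
        for (int offset = 16; offset > 0; offset >>= 1) {
            x += __shfl_xor_sync(0xffffffff, x, offset, 32); // butterfly exchange
        }
        return x; // every lane holds the full sum; lane 0 stores it to dst[row]
    }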

+ 3 - 3
llama/ggml-cuda/sumrows.cuh

@@ -1,3 +1,3 @@
-#include "common.cuh"
-
-void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+#include "common.cuh"
+
+void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f16.cuh"
+
+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
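The remaining files switch to the fp32 family: the same head-size and K/V-type combinations, but including fattn-vec-f32.cuh and declaring DECL_FATTN_VEC_F32_CASE(...). Presumably these instances cover GPUs or precision settings where fp16 accumulation is not used; that reading, and the names below, are assumptions mirroring the f16 sketch above rather than code taken from fattn-vec-f32.cuh.

// Hypothetical expansion of DECL_FATTN_VEC_F32_CASE(D, type_K, type_V),
// mirroring the f16 family but accumulating in fp32 inside the kernel.
#define DECL_FATTN_VEC_F32_CASE_SKETCH(D, type_K, type_V)                   \
    template void ggml_cuda_flash_attn_ext_vec_f32_case<D, type_K, type_V>( \
        ggml_backend_cuda_context & ctx, ggml_tensor * dst)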

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);

+ 5 - 0
llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu

@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-vec-f32.cuh"
+
+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);

Some files were not shown because too many files changed in this diff