10 місяців тому · ba0ff1c46a
--- a/llama/.gitignore
+++ b/llama/.gitignore
@@ -3,3 +3,4 @@
 
															 *.lib
														
 
															 *.exp
														
 
															 *.dll
														
 
															+*.o
														
--- a/llama/base64.hpp
+++ b/llama/base64.hpp
@@ -1,392 +1,392 @@
 
															-/*
														
 
															-This is free and unencumbered software released into the public domain.
														
 
															-
														
 
															-Anyone is free to copy, modify, publish, use, compile, sell, or
														
 
															-distribute this software, either in source code form or as a compiled
														
 
															-binary, for any purpose, commercial or non-commercial, and by any
														
 
															-means.
														
 
															-
														
 
															-In jurisdictions that recognize copyright laws, the author or authors
														
 
															-of this software dedicate any and all copyright interest in the
														
 
															-software to the public domain. We make this dedication for the benefit
														
 
															-of the public at large and to the detriment of our heirs and
														
 
															-successors. We intend this dedication to be an overt act of
														
 
															-relinquishment in perpetuity of all present and future rights to this
														
 
															-software under copyright law.
														
 
															-
														
 
															-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
														
 
															-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
														
 
															-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
														
 
															-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
														
 
															-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
														
 
															-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
														
 
															-OTHER DEALINGS IN THE SOFTWARE.
														
 
															-
														
 
															-For more information, please refer to <http://unlicense.org>
														
 
															-*/
														
 
															-
														
 
															-#ifndef PUBLIC_DOMAIN_BASE64_HPP_
														
 
															-#define PUBLIC_DOMAIN_BASE64_HPP_
														
 
															-
														
 
															-#include <cstdint>
														
 
															-#include <iterator>
														
 
															-#include <stdexcept>
														
 
															-#include <string>
														
 
															-
														
 
															-class base64_error : public std::runtime_error
														
 
															-{
														
 
															-public:
														
 
															-    using std::runtime_error::runtime_error;
														
 
															-};
														
 
															-
														
 
															-class base64
														
 
															-{
														
 
															-public:
														
 
															-    enum class alphabet
														
 
															-    {
														
 
															-        /** the alphabet is detected automatically */
														
 
															-        auto_,
														
 
															-        /** the standard base64 alphabet is used */
														
 
															-        standard,
														
 
															-        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
														
 
															-        url_filename_safe
														
 
															-    };
														
 
															-
														
 
															-    enum class decoding_behavior
														
 
															-    {
														
 
															-        /** if the input is not padded, the remaining bits are ignored */
														
 
															-        moderate,
														
 
															-        /** if a padding character is encounter decoding is finished */
														
 
															-        loose
														
 
															-    };
														
 
															-
														
 
															-    /**
														
 
															-     Encodes all the elements from `in_begin` to `in_end` to `out`.
														
 
															-
														
 
															-     @warning The source and destination cannot overlap. The destination must be able to hold at least
														
 
															-     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
														
 
															-
														
 
															-     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
														
 
															-     8 bits
														
 
															-     @tparam Output_iterator the destination; the elements written to it are from the type `char`
														
 
															-     @param in_begin the beginning of the source
														
 
															-     @param in_end the ending of the source
														
 
															-     @param out the destination iterator
														
 
															-     @param alphabet which alphabet should be used
														
 
															-     @returns the iterator to the next element past the last element copied
														
 
															-     @throws see `Input_iterator` and `Output_iterator`
														
 
															-    */
														
 
															-    template<typename Input_iterator, typename Output_iterator>
														
 
															-    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
														
 
															-                                  alphabet alphabet = alphabet::standard)
														
 
															-    {
														
 
															-        constexpr auto pad = '=';
														
 
															-        const char* alpha  = alphabet == alphabet::url_filename_safe
														
 
															-                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
														
 
															-                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
														
 
															-
														
 
															-        while (in_begin != in_end) {
														
 
															-            std::uint8_t i0 = 0, i1 = 0, i2 = 0;
														
 
															-
														
 
															-            // first character
														
 
															-            i0 = static_cast<std::uint8_t>(*in_begin);
														
 
															-            ++in_begin;
														
 
															-
														
 
															-            *out = alpha[i0 >> 2 & 0x3f];
														
 
															-            ++out;
														
 
															-
														
 
															-            // part of first character and second
														
 
															-            if (in_begin != in_end) {
														
 
															-                i1 = static_cast<std::uint8_t>(*in_begin);
														
 
															-                ++in_begin;
														
 
															-
														
 
															-                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
														
 
															-                ++out;
														
 
															-            } else {
														
 
															-                *out = alpha[(i0 & 0x3) << 4];
														
 
															-                ++out;
														
 
															-
														
 
															-                // last padding
														
 
															-                *out = pad;
														
 
															-                ++out;
														
 
															-
														
 
															-                // last padding
														
 
															-                *out = pad;
														
 
															-                ++out;
														
 
															-
														
 
															-                break;
														
 
															-            }
														
 
															-
														
 
															-            // part of second character and third
														
 
															-            if (in_begin != in_end) {
														
 
															-                i2 = static_cast<std::uint8_t>(*in_begin);
														
 
															-                ++in_begin;
														
 
															-
														
 
															-                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
														
 
															-                ++out;
														
 
															-            } else {
														
 
															-                *out = alpha[(i1 & 0xf) << 2];
														
 
															-                ++out;
														
 
															-
														
 
															-                // last padding
														
 
															-                *out = pad;
														
 
															-                ++out;
														
 
															-
														
 
															-                break;
														
 
															-            }
														
 
															-
														
 
															-            // rest of third
														
 
															-            *out = alpha[i2 & 0x3f];
														
 
															-            ++out;
														
 
															-        }
														
 
															-
														
 
															-        return out;
														
 
															-    }
														
 
															-    /**
														
 
															-     Encodes a string.
														
 
															-
														
 
															-     @param str the string that should be encoded
														
 
															-     @param alphabet which alphabet should be used
														
 
															-     @returns the encoded base64 string
														
 
															-     @throws see base64::encode()
														
 
															-    */
														
 
															-    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
														
 
															-    {
														
 
															-        std::string result;
														
 
															-
														
 
															-        result.reserve(required_encode_size(str.length()) + 1);
														
 
															-
														
 
															-        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
														
 
															-
														
 
															-        return result;
														
 
															-    }
														
 
															-    /**
														
 
															-     Encodes a char array.
														
 
															-
														
 
															-     @param buffer the char array
														
 
															-     @param size the size of the array
														
 
															-     @param alphabet which alphabet should be used
														
 
															-     @returns the encoded string
														
 
															-    */
														
 
															-    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
														
 
															-    {
														
 
															-        std::string result;
														
 
															-
														
 
															-        result.reserve(required_encode_size(size) + 1);
														
 
															-
														
 
															-        encode(buffer, buffer + size, std::back_inserter(result), alphabet);
														
 
															-
														
 
															-        return result;
														
 
															-    }
														
 
															-    /**
														
 
															-     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
														
 
															-     in other words: inplace decoding is possible.
														
 
															-
														
 
															-     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
														
 
															-     otherwise the behavior depends on the output iterator.
														
 
															-
														
 
															-     @tparam Input_iterator the source; the returned elements are cast to `char`
														
 
															-     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
														
 
															-     @param in_begin the beginning of the source
														
 
															-     @param in_end the ending of the source
														
 
															-     @param out the destination iterator
														
 
															-     @param alphabet which alphabet should be used
														
 
															-     @param behavior the behavior when an error was detected
														
 
															-     @returns the iterator to the next element past the last element copied
														
 
															-     @throws base64_error depending on the set behavior
														
 
															-     @throws see `Input_iterator` and `Output_iterator`
														
 
															-    */
														
 
															-    template<typename Input_iterator, typename Output_iterator>
														
 
															-    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
														
 
															-                                  alphabet alphabet          = alphabet::auto_,
														
 
															-                                  decoding_behavior behavior = decoding_behavior::moderate)
														
 
															-    {
														
 
															-        //constexpr auto pad = '=';
														
 
															-        std::uint8_t last  = 0;
														
 
															-        auto bits          = 0;
														
 
															-
														
 
															-        while (in_begin != in_end) {
														
 
															-            auto c = *in_begin;
														
 
															-            ++in_begin;
														
 
															-
														
 
															-            if (c == '=') {
														
 
															-                break;
														
 
															-            }
														
 
															-
														
 
															-            auto part = _base64_value(alphabet, c);
														
 
															-
														
 
															-            // enough bits for one byte
														
 
															-            if (bits + 6 >= 8) {
														
 
															-                *out = (last << (8 - bits)) | (part >> (bits - 2));
														
 
															-                ++out;
														
 
															-
														
 
															-                bits -= 2;
														
 
															-            } else {
														
 
															-                bits += 6;
														
 
															-            }
														
 
															-
														
 
															-            last = part;
														
 
															-        }
														
 
															-
														
 
															-        // check padding
														
 
															-        if (behavior != decoding_behavior::loose) {
														
 
															-            while (in_begin != in_end) {
														
 
															-                auto c = *in_begin;
														
 
															-                ++in_begin;
														
 
															-
														
 
															-                if (c != '=') {
														
 
															-                    throw base64_error("invalid base64 character.");
														
 
															-                }
														
 
															-            }
														
 
															-        }
														
 
															-
														
 
															-        return out;
														
 
															-    }
														
 
															-    /**
														
 
															-     Decodes a string.
														
 
															-
														
 
															-     @param str the base64 encoded string
														
 
															-     @param alphabet which alphabet should be used
														
 
															-     @param behavior the behavior when an error was detected
														
 
															-     @returns the decoded string
														
 
															-     @throws see base64::decode()
														
 
															-    */
														
 
															-    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
														
 
															-                              decoding_behavior behavior = decoding_behavior::moderate)
														
 
															-    {
														
 
															-        std::string result;
														
 
															-
														
 
															-        result.reserve(max_decode_size(str.length()));
														
 
															-
														
 
															-        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
														
 
															-
														
 
															-        return result;
														
 
															-    }
														
 
															-    /**
														
 
															-     Decodes a string.
														
 
															-
														
 
															-     @param buffer the base64 encoded buffer
														
 
															-     @param size the size of the buffer
														
 
															-     @param alphabet which alphabet should be used
														
 
															-     @param behavior the behavior when an error was detected
														
 
															-     @returns the decoded string
														
 
															-     @throws see base64::decode()
														
 
															-    */
														
 
															-    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
														
 
															-                              decoding_behavior behavior = decoding_behavior::moderate)
														
 
															-    {
														
 
															-        std::string result;
														
 
															-
														
 
															-        result.reserve(max_decode_size(size));
														
 
															-
														
 
															-        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
														
 
															-
														
 
															-        return result;
														
 
															-    }
														
 
															-    /**
														
 
															-     Decodes a string inplace.
														
 
															-
														
 
															-     @param[in,out] str the base64 encoded string
														
 
															-     @param alphabet which alphabet should be used
														
 
															-     @param behavior the behavior when an error was detected
														
 
															-     @throws base64::decode_inplace()
														
 
															-    */
														
 
															-    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
														
 
															-                               decoding_behavior behavior = decoding_behavior::moderate)
														
 
															-    {
														
 
															-        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
														
 
															-    }
														
 
															-    /**
														
 
															-     Decodes a char array inplace.
														
 
															-
														
 
															-     @param[in,out] str the string array
														
 
															-     @param size the length of the array
														
 
															-     @param alphabet which alphabet should be used
														
 
															-     @param behavior the behavior when an error was detected
														
 
															-     @returns the pointer to the next element past the last element decoded
														
 
															-     @throws base64::decode_inplace()
														
 
															-    */
														
 
															-    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
														
 
															-                                decoding_behavior behavior = decoding_behavior::moderate)
														
 
															-    {
														
 
															-        return decode(str, str + size, str, alphabet, behavior);
														
 
															-    }
														
 
															-    /**
														
 
															-     Returns the required decoding size for a given size. The value is calculated with the following formula:
														
 
															-
														
 
															-     $$
														
 
															-     \lceil \frac{size}{4} \rceil \cdot 3
														
 
															-     $$
														
 
															-
														
 
															-     @param size the size of the encoded input
														
 
															-     @returns the size of the resulting decoded buffer; this the absolute maximum
														
 
															-    */
														
 
															-    static std::size_t max_decode_size(std::size_t size) noexcept
														
 
															-    {
														
 
															-        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
														
 
															-    }
														
 
															-    /**
														
 
															-     Returns the required encoding size for a given size. The value is calculated with the following formula:
														
 
															-
														
 
															-     $$
														
 
															-     \lceil \frac{size}{3} \rceil \cdot 4
														
 
															-     $$
														
 
															-
														
 
															-     @param size the size of the decoded input
														
 
															-     @returns the size of the resulting encoded buffer
														
 
															-    */
														
 
															-    static std::size_t required_encode_size(std::size_t size) noexcept
														
 
															-    {
														
 
															-        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
														
 
															-    }
														
 
															-
														
 
															-private:
														
 
															-    static std::uint8_t _base64_value(alphabet& alphabet, char c)
														
 
															-    {
														
 
															-        if (c >= 'A' && c <= 'Z') {
														
 
															-            return c - 'A';
														
 
															-        } else if (c >= 'a' && c <= 'z') {
														
 
															-            return c - 'a' + 26;
														
 
															-        } else if (c >= '0' && c <= '9') {
														
 
															-            return c - '0' + 52;
														
 
															-        }
														
 
															-
														
 
															-        // comes down to alphabet
														
 
															-        if (alphabet == alphabet::standard) {
														
 
															-            if (c == '+') {
														
 
															-                return 62;
														
 
															-            } else if (c == '/') {
														
 
															-                return 63;
														
 
															-            }
														
 
															-        } else if (alphabet == alphabet::url_filename_safe) {
														
 
															-            if (c == '-') {
														
 
															-                return 62;
														
 
															-            } else if (c == '_') {
														
 
															-                return 63;
														
 
															-            }
														
 
															-        } // auto detect
														
 
															-        else {
														
 
															-            if (c == '+') {
														
 
															-                alphabet = alphabet::standard;
														
 
															-
														
 
															-                return 62;
														
 
															-            } else if (c == '/') {
														
 
															-                alphabet = alphabet::standard;
														
 
															-
														
 
															-                return 63;
														
 
															-            } else if (c == '-') {
														
 
															-                alphabet = alphabet::url_filename_safe;
														
 
															-
														
 
															-                return 62;
														
 
															-            } else if (c == '_') {
														
 
															-                alphabet = alphabet::url_filename_safe;
														
 
															-
														
 
															-                return 63;
														
 
															-            }
														
 
															-        }
														
 
															-
														
 
															-        throw base64_error("invalid base64 character.");
														
 
															-    }
														
 
															-};
														
 
															-
														
 
															-#endif // !PUBLIC_DOMAIN_BASE64_HPP_
														
 
															+/*

														
 
															+This is free and unencumbered software released into the public domain.

														
 
															+

														
 
															+Anyone is free to copy, modify, publish, use, compile, sell, or

														
 
															+distribute this software, either in source code form or as a compiled

														
 
															+binary, for any purpose, commercial or non-commercial, and by any

														
 
															+means.

														
 
															+

														
 
															+In jurisdictions that recognize copyright laws, the author or authors

														
 
															+of this software dedicate any and all copyright interest in the

														
 
															+software to the public domain. We make this dedication for the benefit

														
 
															+of the public at large and to the detriment of our heirs and

														
 
															+successors. We intend this dedication to be an overt act of

														
 
															+relinquishment in perpetuity of all present and future rights to this

														
 
															+software under copyright law.

														
 
															+

														
 
															+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

														
 
															+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

														
 
															+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.

														
 
															+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR

														
 
															+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,

														
 
															+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR

														
 
															+OTHER DEALINGS IN THE SOFTWARE.

														
 
															+

														
 
															+For more information, please refer to <http://unlicense.org>

														
 
															+*/

														
 
															+

														
 
															+#ifndef PUBLIC_DOMAIN_BASE64_HPP_

														
 
															+#define PUBLIC_DOMAIN_BASE64_HPP_

														
 
															+

														
 
															+#include <cstdint>

														
 
															+#include <iterator>

														
 
															+#include <stdexcept>

														
 
															+#include <string>

														
 
															+

														
 
															+class base64_error : public std::runtime_error

														
 
															+{

														
 
															+public:

														
 
															+    using std::runtime_error::runtime_error;

														
 
															+};

														
 
															+

														
 
															+class base64

														
 
															+{

														
 
															+public:

														
 
															+    enum class alphabet

														
 
															+    {

														
 
															+        /** the alphabet is detected automatically */

														
 
															+        auto_,

														
 
															+        /** the standard base64 alphabet is used */

														
 
															+        standard,

														
 
															+        /** like `standard`, except that the characters `+` and `/` are replaced by `-` and `_` respectively */

														
 
															+        url_filename_safe

														
 
															+    };

														
 
															+

														
 
															+    enum class decoding_behavior

														
 
															+    {

														
 
															+        /** if the input is not padded, the remaining bits are ignored */

														
 
															+        moderate,

														
 
															+        /** if a padding character is encountered, decoding is finished */

														
 
															+        loose

														
 
															+    };

														
 
															+

														
 
															+    /**

														
 
															+     Encodes all the elements from `in_begin` to `in_end` to `out`.

														
 
															+

														
 
															+     @warning The source and destination cannot overlap. The destination must be able to hold at least

														
 
															+     `required_encode_size(std::distance(in_begin, in_end))` elements; otherwise the behavior depends on the output iterator.

														
 
															+

														
 
															+     @tparam Input_iterator the source; the dereferenced elements are cast to `std::uint8_t` and should not be wider than

														
 
															+     8 bits

														
 
															+     @tparam Output_iterator the destination; the elements written to it are of type `char`

														
 
															+     @param in_begin the beginning of the source

														
 
															+     @param in_end the ending of the source

														
 
															+     @param out the destination iterator

														
 
															+     @param alphabet which alphabet should be used

														
 
															+     @returns the iterator to the next element past the last element copied

														
 
															+     @throws see `Input_iterator` and `Output_iterator`

														
 
															+    */

														
 
															+    template<typename Input_iterator, typename Output_iterator>

														
 
															+    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,

														
 
															+                                  alphabet alphabet = alphabet::standard)

														
 
															+    {

														
 
															+        constexpr auto pad = '=';

														
 
															+        const char* alpha  = alphabet == alphabet::url_filename_safe

														
 
															+                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"

														
 
															+                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

														
 
															+

														
 
															+        while (in_begin != in_end) {

														
 
															+            std::uint8_t i0 = 0, i1 = 0, i2 = 0;

														
 
															+

														
 
															+            // first character

														
 
															+            i0 = static_cast<std::uint8_t>(*in_begin);

														
 
															+            ++in_begin;

														
 
															+

														
 
															+            *out = alpha[i0 >> 2 & 0x3f];

														
 
															+            ++out;

														
 
															+

														
 
															+            // part of first character and second

														
 
															+            if (in_begin != in_end) {

														
 
															+                i1 = static_cast<std::uint8_t>(*in_begin);

														
 
															+                ++in_begin;

														
 
															+

														
 
															+                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];

														
 
															+                ++out;

														
 
															+            } else {

														
 
															+                *out = alpha[(i0 & 0x3) << 4];

														
 
															+                ++out;

														
 
															+

														
 
															+                // last padding

														
 
															+                *out = pad;

														
 
															+                ++out;

														
 
															+

														
 
															+                // last padding

														
 
															+                *out = pad;

														
 
															+                ++out;

														
 
															+

														
 
															+                break;

														
 
															+            }

														
 
															+

														
 
															+            // part of second character and third

														
 
															+            if (in_begin != in_end) {

														
 
															+                i2 = static_cast<std::uint8_t>(*in_begin);

														
 
															+                ++in_begin;

														
 
															+

														
 
															+                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];

														
 
															+                ++out;

														
 
															+            } else {

														
 
															+                *out = alpha[(i1 & 0xf) << 2];

														
 
															+                ++out;

														
 
															+

														
 
															+                // last padding

														
 
															+                *out = pad;

														
 
															+                ++out;

														
 
															+

														
 
															+                break;

														
 
															+            }

														
 
															+

														
 
															+            // rest of third

														
 
															+            *out = alpha[i2 & 0x3f];

														
 
															+            ++out;

														
 
															+        }

														
 
															+

														
 
															+        return out;

														
 
															+    }

														
 
															+    /**

														
 
															+     Encodes a string.

														
 
															+

														
 
															+     @param str the string that should be encoded

														
 
															+     @param alphabet which alphabet should be used

														
 
															+     @returns the encoded base64 string

														
 
															+     @throws see base64::encode()

														
 
															+    */

														
 
															+    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)

														
 
															+    {

														
 
															+        std::string result;

														
 
															+

														
 
															+        result.reserve(required_encode_size(str.length()) + 1);

														
 
															+

														
 
															+        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);

														
 
															+

														
 
															+        return result;

														
 
															+    }

														
 
															+    /**

														
 
															+     Encodes a char array.

														
 
															+

														
 
															+     @param buffer the char array

														
 
															+     @param size the size of the array

														
 
															+     @param alphabet which alphabet should be used

														
 
															+     @returns the encoded string

														
 
															+    */

														
 
															+    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)

														
 
															+    {

														
 
															+        std::string result;

														
 
															+

														
 
															+        result.reserve(required_encode_size(size) + 1);

														
 
															+

														
 
															+        encode(buffer, buffer + size, std::back_inserter(result), alphabet);

														
 
															+

														
 
															+        return result;

														
 
															+    }

														
 
															+    /**

														
 
															+     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,

														
 
															+     in other words: inplace decoding is possible.

														
 
															+

														
 
															+     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,

														
 
															+     otherwise the behavior depends on the output iterator.

														
 
															+

														
 
															+     @tparam Input_iterator the source; the returned elements are cast to `char`

														
 
															+     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`

														
 
															+     @param in_begin the beginning of the source

														
 
															+     @param in_end the ending of the source

														
 
															+     @param out the destination iterator

														
 
															+     @param alphabet which alphabet should be used

														
 
															+     @param behavior the behavior when an error was detected

														
 
															+     @returns the iterator to the next element past the last element copied

														
 
															+     @throws base64_error depending on the set behavior

														
 
															+     @throws see `Input_iterator` and `Output_iterator`

														
 
															+    */

														
 
															+    template<typename Input_iterator, typename Output_iterator>

														
 
															+    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,

														
 
															+                                  alphabet alphabet          = alphabet::auto_,

														
 
															+                                  decoding_behavior behavior = decoding_behavior::moderate)

														
 
															+    {

														
 
															+        //constexpr auto pad = '=';

														
 
															+        std::uint8_t last  = 0;

														
 
															+        auto bits          = 0;

														
 
															+

														
 
															+        while (in_begin != in_end) {

														
 
															+            auto c = *in_begin;

														
 
															+            ++in_begin;

														
 
															+

														
 
															+            if (c == '=') {

														
 
															+                break;

														
 
															+            }

														
 
															+

														
 
															+            auto part = _base64_value(alphabet, c);

														
 
															+

														
 
															+            // enough bits for one byte

														
 
															+            if (bits + 6 >= 8) {

														
 
															+                *out = (last << (8 - bits)) | (part >> (bits - 2));

														
 
															+                ++out;

														
 
															+

														
 
															+                bits -= 2;

														
 
															+            } else {

														
 
															+                bits += 6;

														
 
															+            }

														
 
															+

														
 
															+            last = part;

														
 
															+        }

														
 
															+

														
 
															+        // check padding

														
 
															+        if (behavior != decoding_behavior::loose) {

														
 
															+            while (in_begin != in_end) {

														
 
															+                auto c = *in_begin;

														
 
															+                ++in_begin;

														
 
															+

														
 
															+                if (c != '=') {

														
 
															+                    throw base64_error("invalid base64 character.");

														
 
															+                }

														
 
															+            }

														
 
															+        }

														
 
															+

														
 
															+        return out;

														
 
															+    }

														
 
															+    /**

														
 
															+     Decodes a string.

														
 
															+

														
 
															+     @param str the base64 encoded string

														
 
															+     @param alphabet which alphabet should be used

														
 
															+     @param behavior the behavior when an error was detected

														
 
															+     @returns the decoded string

														
 
															+     @throws see base64::decode()

														
 
															+    */

														
 
															+    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,

														
 
															+                              decoding_behavior behavior = decoding_behavior::moderate)

														
 
															+    {

														
 
															+        std::string result;

														
 
															+

														
 
															+        result.reserve(max_decode_size(str.length()));

														
 
															+

														
 
															+        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);

														
 
															+

														
 
															+        return result;

														
 
															+    }

														
 
															+    /**

														
 
															+     Decodes a string.

														
 
															+

														
 
															+     @param buffer the base64 encoded buffer

														
 
															+     @param size the size of the buffer

														
 
															+     @param alphabet which alphabet should be used

														
 
															+     @param behavior the behavior when an error was detected

														
 
															+     @returns the decoded string

														
 
															+     @throws see base64::decode()

														
 
															+    */

														
 
															+    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,

														
 
															+                              decoding_behavior behavior = decoding_behavior::moderate)

														
 
															+    {

														
 
															+        std::string result;

														
 
															+

														
 
															+        result.reserve(max_decode_size(size));

														
 
															+

														
 
															+        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);

														
 
															+

														
 
															+        return result;

														
 
															+    }

														
 
															+    /**

														
 
															+     Decodes a string inplace.

														
 
															+

														
 
															+     @param[in,out] str the base64 encoded string

														
 
															+     @param alphabet which alphabet should be used

														
 
															+     @param behavior the behavior when an error was detected

														
 
															+     @throws base64::decode_inplace()

														
 
															+    */

														
 
															+    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,

														
 
															+                               decoding_behavior behavior = decoding_behavior::moderate)

														
 
															+    {

														
 
															+        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());

														
 
															+    }

														
 
															+    /**

														
 
															+     Decodes a char array inplace.

														
 
															+

														
 
															+     @param[in,out] str the string array

														
 
															+     @param size the length of the array

														
 
															+     @param alphabet which alphabet should be used

														
 
															+     @param behavior the behavior when an error was detected

														
 
															+     @returns the pointer to the next element past the last element decoded

														
 
															+     @throws base64::decode_inplace()

														
 
															+    */

														
 
															+    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,

														
 
															+                                decoding_behavior behavior = decoding_behavior::moderate)

														
 
															+    {

														
 
															+        return decode(str, str + size, str, alphabet, behavior);

														
 
															+    }

														
 
															+    /**

														
 
															+     Returns the required decoding size for a given size. The value is calculated with the following formula:

														
 
															+

														
 
															+     $$

														
 
															+     \lceil \frac{size}{4} \rceil \cdot 3

														
 
															+     $$

														
 
															+

														
 
															+     @param size the size of the encoded input

														
 
															+     @returns the size of the resulting decoded buffer; this the absolute maximum

														
 
															+    */

														
 
															+    static std::size_t max_decode_size(std::size_t size) noexcept

														
 
															+    {

														
 
															+        return (size / 4 + (size % 4 ? 1 : 0)) * 3;

														
 
															+    }

														
 
															+    /**

														
 
															+     Returns the required encoding size for a given size. The value is calculated with the following formula:

														
 
															+

														
 
															+     $$

														
 
															+     \lceil \frac{size}{3} \rceil \cdot 4

														
 
															+     $$

														
 
															+

														
 
															+     @param size the size of the decoded input

														
 
															+     @returns the size of the resulting encoded buffer

														
 
															+    */

														
 
															+    static std::size_t required_encode_size(std::size_t size) noexcept

														
 
															+    {

														
 
															+        return (size / 3 + (size % 3 ? 1 : 0)) * 4;

														
 
															+    }

														
 
															+

														
 
															+private:

														
 
															+    static std::uint8_t _base64_value(alphabet& alphabet, char c)

														
 
															+    {

														
 
															+        if (c >= 'A' && c <= 'Z') {

														
 
															+            return c - 'A';

														
 
															+        } else if (c >= 'a' && c <= 'z') {

														
 
															+            return c - 'a' + 26;

														
 
															+        } else if (c >= '0' && c <= '9') {

														
 
															+            return c - '0' + 52;

														
 
															+        }

														
 
															+

														
 
															+        // comes down to alphabet

														
 
															+        if (alphabet == alphabet::standard) {

														
 
															+            if (c == '+') {

														
 
															+                return 62;

														
 
															+            } else if (c == '/') {

														
 
															+                return 63;

														
 
															+            }

														
 
															+        } else if (alphabet == alphabet::url_filename_safe) {

														
 
															+            if (c == '-') {

														
 
															+                return 62;

														
 
															+            } else if (c == '_') {

														
 
															+                return 63;

														
 
															+            }

														
 
															+        } // auto detect

														
 
															+        else {

														
 
															+            if (c == '+') {

														
 
															+                alphabet = alphabet::standard;

														
 
															+

														
 
															+                return 62;

														
 
															+            } else if (c == '/') {

														
 
															+                alphabet = alphabet::standard;

														
 
															+

														
 
															+                return 63;

														
 
															+            } else if (c == '-') {

														
 
															+                alphabet = alphabet::url_filename_safe;

														
 
															+

														
 
															+                return 62;

														
 
															+            } else if (c == '_') {

														
 
															+                alphabet = alphabet::url_filename_safe;

														
 
															+

														
 
															+                return 63;

														
 
															+            }

														
 
															+        }

														
 
															+

														
 
															+        throw base64_error("invalid base64 character.");

														
 
															+    }

														
 
															+};

														
 
															+

														
 
															+#endif // !PUBLIC_DOMAIN_BASE64_HPP_

														
--- a/llama/build-info.cpp
+++ b/llama/build-info.cpp
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/build_cuda.sh
+++ b/llama/build_cuda.sh
@@ -9,7 +9,7 @@ else
 
															 fi
														
 
															 nvcc \
														
 
															-    -t 12 \
														
 
															+    -t $(nproc) \
														
 
															     --generate-code=arch=compute_50,code=[compute_50,sm_50] \
														
 
															     --generate-code=arch=compute_52,code=[compute_52,sm_52] \
														
 
															     --generate-code=arch=compute_61,code=[compute_61,sm_61] \
														
@@ -30,9 +30,18 @@ nvcc \
 
															     -use_fast_math \
														
 
															     -link \
														
 
															     -shared \
														
 
															-    -fPIC \
														
 
															     -I. \
														
 
															     -lcuda -lcublas -lcudart -lcublasLt \
														
 
															     -O3 \
														
 
															     -o $output \
														
 
															-    ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
														
 
															+    ggml-cuda.cu \
														
 
															+    ggml-cuda/*.cu \
														
 
															+    ggml-cuda/template-instances/fattn-wmma*.cu \
														
 
															+    ggml-cuda/template-instances/mmq*.cu \
														
 
															+    ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu \
														
 
															+    ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu \
														
 
															+    ggml-cuda/template-instances/fattn-vec*f16-f16.cu \
														
 
															+    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
														
 
															+
														
 
															+#   -DGGML_CUDA_USE_GRAPHS=1 
														
 
															+#   -DGGML_CUDA_FA_ALL_QUANTS=1
														
--- a/llama/build_hipblas.sh
+++ b/llama/build_hipblas.sh
@@ -26,7 +26,7 @@ additional_flags=""
 
															 if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
														
 
															     output="ggml-hipblas.dll"
														
 
															-    additional_flags=" -Xclang --dependent-lib=msvcrt -Wl,/subsystem:console"
														
 
															+    additional_flags=" -Xclang --dependent-lib=msvcrt"
														
 
															 else
														
 
															     output="libggml-hipblas.so"
														
 
															     archs+=("${linux_archs[@]}")
														
@@ -36,37 +36,61 @@ for arch in "${archs[@]}"; do
 
															     additional_flags+=" --offload-arch=$arch"
														
 
															 done
														
 
															-hipcc \
														
 
															-    -v \
														
 
															-    -parallel-jobs=12 \
														
 
															-    -O3 \
														
 
															-    -DGGML_USE_CUDA \
														
 
															-    -DGGML_BUILD=1 \
														
 
															-    -DGGML_SHARED=1 \
														
 
															-    -DGGML_CUDA_DMMV_X=32 \
														
 
															-    -DGGML_CUDA_MMV_Y=1 \
														
 
															-    -DGGML_SCHED_MAX_COPIES=4 \
														
 
															-    -DGGML_USE_HIPBLAS \
														
 
															-    -DGGML_USE_LLAMAFILE \
														
 
															-    -DHIP_FAST_MATH \
														
 
															-    -DNDEBUG \
														
 
															-    -DK_QUANTS_PER_ITERATION=2 \
														
 
															-    -D_CRT_SECURE_NO_WARNINGS \
														
 
															-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
														
 
															-    -D_GNU_SOURCE \
														
 
															-    -Wno-expansion-to-defined \
														
 
															-    -Wno-invalid-noreturn \
														
 
															-    -Wno-ignored-attributes \
														
 
															-    -Wno-pass-failed \
														
 
															-    -Wno-deprecated-declarations \
														
 
															-    -Wno-unused-result \
														
 
															-    -I. \
														
 
															-    -lhipblas -lamdhip64 -lrocblas \
														
 
															-    -shared \
														
 
															-    $additional_flags \
														
 
															-    -o $output \
														
 
															-    ggml-cuda.cu ggml-cuda/*.cu ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
														
 
															+# Create an array of all source files, expanding globs
														
 
															+sources=(
														
 
															+    $(echo ggml-cuda/template-instances/fattn-wmma*.cu)
														
 
															+    $(echo ggml-cuda/template-instances/mmq*.cu)
														
 
															+    $(echo ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)
														
 
															+    $(echo ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)
														
 
															+    $(echo ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
														
 
															+    ggml-cuda.cu
														
 
															+    $(echo ggml-cuda/*.cu)
														
 
															+    ggml.c
														
 
															+    ggml-backend.c
														
 
															+    ggml-alloc.c
														
 
															+    ggml-quants.c
														
 
															+    sgemm.cpp
														
 
															+)
														
 
															+
														
 
															+# Function to compile a single source file
														
 
															+compile_source() {
														
 
															+    src="$1"
														
 
															+    hipcc -c -O3 -DGGML_USE_CUDA -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 \
														
 
															+          -DGGML_SCHED_MAX_COPIES=4 -DGGML_USE_HIPBLAS -DGGML_USE_LLAMAFILE -DHIP_FAST_MATH -DNDEBUG \
														
 
															+          -DK_QUANTS_PER_ITERATION=2 -D_CRT_SECURE_NO_WARNINGS -DCMAKE_POSITION_INDEPENDENT_CODE=on \
														
 
															+          -D_GNU_SOURCE -Wno-expansion-to-defined -Wno-invalid-noreturn -Wno-ignored-attributes -Wno-pass-failed \
														
 
															+          -Wno-deprecated-declarations -Wno-unused-result -I. \
														
 
															+          $additional_flags -o "${src%.cu}.o" "$src"
														
 
															+}
														
 
															+
														
 
															+# Function to handle Ctrl+C
														
 
															+cleanup() {
														
 
															+    echo "Terminating all background processes..."
														
 
															+    kill 0
														
 
															+}
														
 
															+
														
 
															+# Set trap to handle SIGINT (Ctrl+C)
														
 
															+trap cleanup SIGINT
														
 
															+
														
 
															+# Limit the number of concurrent jobs
														
 
															+max_jobs=$(nproc)
														
 
															+job_count=0
														
 
															+
														
 
															+for src in "${sources[@]}"; do
														
 
															+    echo "$src"
														
 
															+    compile_source "$src" &
														
 
															+    job_count=$((job_count + 1))
														
 
															+    if [[ $job_count -ge $max_jobs ]]; then
														
 
															+        wait -n
														
 
															+        job_count=$((job_count - 1))
														
 
															+    fi
														
 
															+done
														
 
															+
														
 
															+wait
														
 
															+
														
 
															+# Link all object files into a shared library
														
 
															+echo "Linking object files..."
														
 
															+hipcc -v -shared -o $output *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o -lhipblas -lamdhip64 -lrocblas
														
 
															-    # -D_DLL \
														
 
															-    # -D_MT \
														
 
															-    # -D_XOPEN_SOURCE=600 \
														
 
															+# Clean up object files after linking
														
 
															+rm -f *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o
														
--- a/llama/clip.cpp
+++ b/llama/clip.cpp
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/clip.h
+++ b/llama/clip.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/common.cpp
+++ b/llama/common.cpp
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -226,19 +226,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
 
															             }
														
 
															             params.hf_file = params.model;
														
 
															         } else if (params.model.empty()) {
														
 
															-            std::string cache_directory = fs_get_cache_directory();
														
 
															-            const bool success = fs_create_directory_with_parents(cache_directory);
														
 
															-            if (!success) {
														
 
															-                throw std::runtime_error("failed to create cache directory: " + cache_directory);
														
 
															-            }
														
 
															-            params.model = cache_directory + string_split(params.hf_file, '/').back();
														
 
															+            params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
														
 
															         }
														
 
															     } else if (!params.model_url.empty()) {
														
 
															         if (params.model.empty()) {
														
 
															             auto f = string_split(params.model_url, '#').front();
														
 
															             f = string_split(f, '?').front();
														
 
															-            f = string_split(f, '/').back();
														
 
															-            params.model =  "models/" + f;
														
 
															+            params.model = fs_get_cache_file(string_split(f, '/').back());
														
 
															         }
														
 
															     } else if (params.model.empty()) {
														
 
															         params.model = DEFAULT_MODEL_PATH;
														
@@ -1517,6 +1511,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 
															         params.chat_template = argv[i];
														
 
															         return true;
														
 
															     }
														
 
															+    if (arg == "--slot-prompt-similarity" || arg == "-sps") {
														
 
															+        if (++i >= argc) {
														
 
															+            invalid_param = true;
														
 
															+            return true;
														
 
															+        }
														
 
															+        params.slot_prompt_similarity = std::stof(argv[i]);
														
 
															+        return true;
														
 
															+    }
														
 
															     if (arg == "-pps") {
														
 
															         params.is_pp_shared = true;
														
 
															         return true;
														
@@ -1939,6 +1941,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 
															                                                                         "set custom jinja chat template (default: template taken from model's metadata)\n"
														
 
															                                                                         "only commonly used templates are accepted:\n"
														
 
															                                                                         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
														
 
															+    options.push_back({ "server",      "-sps,  --slot-prompt-similarity SIMILARITY",
														
 
															+                                                                        "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
														
 
															 #ifndef LOG_DISABLE_LOGS
														
 
															     options.push_back({ "logging" });
														
@@ -2295,6 +2299,16 @@ std::string fs_get_cache_directory() {
 
															     return ensure_trailing_slash(cache_directory);
														
 
															 }
														
 
															+std::string fs_get_cache_file(const std::string & filename) {
														
 
															+    GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
														
 
															+    std::string cache_directory = fs_get_cache_directory();
														
 
															+    const bool success = fs_create_directory_with_parents(cache_directory);
														
 
															+    if (!success) {
														
 
															+        throw std::runtime_error("failed to create cache directory: " + cache_directory);
														
 
															+    }
														
 
															+    return cache_directory + filename;
														
 
															+}
														
 
															+
														
 
															 //
														
 
															 // Model utils
														
--- a/llama/common.h
+++ b/llama/common.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -229,6 +229,8 @@ struct gpt_params {
 
															     std::string slot_save_path;
														
 
															+    float slot_prompt_similarity = 0.5f;
														
 
															+
														
 
															     // batched-bench params
														
 
															     bool is_pp_shared = false;
														
@@ -301,6 +303,7 @@ bool fs_validate_filename(const std::string & filename);
 
															 bool fs_create_directory_with_parents(const std::string & path);
														
 
															 std::string fs_get_cache_directory();
														
 
															+std::string fs_get_cache_file(const std::string & filename);
														
 
															 //
														
 
															 // Model utils
														
--- a/llama/ggml-alloc.c
+++ b/llama/ggml-alloc.c
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/ggml-alloc.h
+++ b/llama/ggml-alloc.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -24,79 +24,79 @@
 
															  * SOFTWARE.
														
 
															  */
														
 
															-#pragma once
														
 
															-
														
 
															-#include "ggml.h"
														
 
															-
														
 
															-#ifdef  __cplusplus
														
 
															-extern "C" {
														
 
															-#endif
														
 
															-
														
 
															-typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
														
 
															-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
														
 
															-typedef struct ggml_backend * ggml_backend_t;
														
 
															-
														
 
															-// Tensor allocator
														
 
															-struct ggml_tallocr {
														
 
															-    ggml_backend_buffer_t buffer;
														
 
															-    void * base;
														
 
															-    size_t alignment;
														
 
															-    size_t offset;
														
 
															-};
														
 
															-
														
 
															-GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
														
 
															-GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
														
 
															-
														
 
															-// Graph allocator
														
 
															-/*
														
 
															-  Example usage:
														
 
															-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
														
 
															-
														
 
															-    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
														
 
															-    ggml_gallocr_reserve(galloc, build_graph(max_batch));
														
 
															-
														
 
															-    // allocate the graph
														
 
															-    struct ggml_cgraph * graph = build_graph(batch);
														
 
															-    ggml_gallocr_alloc_graph(galloc, graph);
														
 
															-
														
 
															-    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
														
 
															-
														
 
															-    // evaluate the graph
														
 
															-    ggml_backend_graph_compute(backend, graph);
														
 
															-*/
														
 
															-
														
 
															-// special tensor flags for use with the graph allocator:
														
 
															-//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
														
 
															-//   ggml_set_output(): output tensors are never freed and never overwritten
														
 
															-
														
 
															-typedef struct ggml_gallocr * ggml_gallocr_t;
														
 
															-
														
 
															-GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
														
 
															-GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
														
 
															-GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
														
 
															-
														
 
															-// pre-allocate buffers from a measure graph - does not allocate or modify the graph
														
 
															-// call with a worst-case graph to avoid buffer reallocations
														
 
															-// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
														
 
															-// returns false if the buffer allocation failed
														
 
															-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
														
 
															-GGML_API bool ggml_gallocr_reserve_n(
														
 
															-    ggml_gallocr_t galloc,
														
 
															-    struct ggml_cgraph * graph,
														
 
															-    const int * node_buffer_ids,
														
 
															-    const int * leaf_buffer_ids);
														
 
															-
														
 
															-// automatic reallocation if the topology changes when using a single buffer
														
 
															-// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
														
 
															-GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
														
 
															-
														
 
															-GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
														
 
															-
														
 
															-// Utils
														
 
															-// Create a buffer and allocate all the tensors in a ggml_context
														
 
															-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
														
 
															-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
														
 
															-
														
 
															-#ifdef  __cplusplus
														
 
															-}
														
 
															-#endif
														
 
															+#pragma once

														
 
															+

														
 
															+#include "ggml.h"

														
 
															+

														
 
															+#ifdef  __cplusplus

														
 
															+extern "C" {

														
 
															+#endif

														
 
															+

														
 
															+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;

														
 
															+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;

														
 
															+typedef struct ggml_backend * ggml_backend_t;

														
 
															+

														
 
															+// Tensor allocator

														
 
															+struct ggml_tallocr {

														
 
															+    ggml_backend_buffer_t buffer;

														
 
															+    void * base;

														
 
															+    size_t alignment;

														
 
															+    size_t offset;

														
 
															+};

														
 
															+

														
 
															+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);

														
 
															+GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);

														
 
															+

														
 
															+// Graph allocator

														
 
															+/*

														
 
															+  Example usage:

														
 
															+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());

														
 
															+

														
 
															+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations

														
 
															+    ggml_gallocr_reserve(galloc, build_graph(max_batch));

														
 
															+

														
 
															+    // allocate the graph

														
 
															+    struct ggml_cgraph * graph = build_graph(batch);

														
 
															+    ggml_gallocr_alloc_graph(galloc, graph);

														
 
															+

														
 
															+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

														
 
															+

														
 
															+    // evaluate the graph

														
 
															+    ggml_backend_graph_compute(backend, graph);

														
 
															+*/

														
 
															+

														
 
															+// special tensor flags for use with the graph allocator:

														
 
															+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses

														
 
															+//   ggml_set_output(): output tensors are never freed and never overwritten

														
 
															+

														
 
															+typedef struct ggml_gallocr * ggml_gallocr_t;

														
 
															+

														
 
															+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);

														
 
															+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);

														
 
															+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);

														
 
															+

														
 
															+// pre-allocate buffers from a measure graph - does not allocate or modify the graph

														
 
															+// call with a worst-case graph to avoid buffer reallocations

														
 
															+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed

														
 
															+// returns false if the buffer allocation failed

														
 
															+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

														
 
															+GGML_API bool ggml_gallocr_reserve_n(

														
 
															+    ggml_gallocr_t galloc,

														
 
															+    struct ggml_cgraph * graph,

														
 
															+    const int * node_buffer_ids,

														
 
															+    const int * leaf_buffer_ids);

														
 
															+

														
 
															+// automatic reallocation if the topology changes when using a single buffer

														
 
															+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)

														
 
															+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

														
 
															+

														
 
															+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

														
 
															+

														
 
															+// Utils

														
 
															+// Create a buffer and allocate all the tensors in a ggml_context

														
 
															+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);

														
 
															+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

														
 
															+

														
 
															+#ifdef  __cplusplus

														
 
															+}

														
 
															+#endif

														
--- a/llama/ggml-backend-impl.h
+++ b/llama/ggml-backend-impl.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -24,144 +24,144 @@
 
															  * SOFTWARE.
														
 
															  */
														
 
															-#pragma once
														
 
															-
														
 
															-// ggml-backend internal header
														
 
															-
														
 
															-#include "ggml-backend.h"
														
 
															-
														
 
															-#ifdef  __cplusplus
														
 
															-extern "C" {
														
 
															-#endif
														
 
															-
														
 
															-    //
														
 
															-    // Backend buffer
														
 
															-    //
														
 
															-
														
 
															-    // buffer type
														
 
															-    typedef void * ggml_backend_buffer_type_context_t;
														
 
															-
														
 
															-    struct ggml_backend_buffer_type_i {
														
 
															-        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
														
 
															-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
														
 
															-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
														
 
															-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
														
 
															-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
														
 
															-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
														
 
															-        // check if tensor data is in host memory
														
 
															-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
														
 
															-        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
														
 
															-    };
														
 
															-
														
 
															-    struct ggml_backend_buffer_type {
														
 
															-        struct ggml_backend_buffer_type_i  iface;
														
 
															-        ggml_backend_buffer_type_context_t context;
														
 
															-    };
														
 
															-
														
 
															-    // buffer
														
 
															-    typedef void * ggml_backend_buffer_context_t;
														
 
															-
														
 
															-    struct ggml_backend_buffer_i {
														
 
															-        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
														
 
															-        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
														
 
															-        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
														
 
															-        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
														
 
															-        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
														
 
															-        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
														
 
															-        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
														
 
															-        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
														
 
															-        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
														
 
															-    };
														
 
															-
														
 
															-    struct ggml_backend_buffer {
														
 
															-        struct ggml_backend_buffer_i  iface;
														
 
															-        ggml_backend_buffer_type_t    buft;
														
 
															-        ggml_backend_buffer_context_t context;
														
 
															-        size_t size;
														
 
															-        enum ggml_backend_buffer_usage usage;
														
 
															-    };
														
 
															-
														
 
															-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
														
 
															-                   ggml_backend_buffer_type_t      buft,
														
 
															-            struct ggml_backend_buffer_i           iface,
														
 
															-                   ggml_backend_buffer_context_t   context,
														
 
															-                   size_t                          size);
														
 
															-
														
 
															-    // do not use directly, use ggml_backend_tensor_copy instead
														
 
															-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
														
 
															-
														
 
															-    // buffer that contains a collection of buffers
														
 
															-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
														
 
															-    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
														
 
															-    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
														
 
															-
														
 
															-    //
														
 
															-    // Backend
														
 
															-    //
														
 
															-
														
 
															-    typedef void * ggml_backend_context_t;
														
 
															-
														
 
															-    struct ggml_backend_i {
														
 
															-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
														
 
															-
														
 
															-        void (*GGML_CALL free)(ggml_backend_t backend);
														
 
															-
														
 
															-        // buffer allocation
														
 
															-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
														
 
															-
														
 
															-        // (optional) asynchronous tensor data access
														
 
															-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
														
 
															-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
														
 
															-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
														
 
															-
														
 
															-        // (optional) complete all pending operations
														
 
															-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
														
 
															-
														
 
															-        // compute graph with a plan (not used currently)
														
 
															-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
														
 
															-        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
														
 
															-
														
 
															-        // compute graph with a plan
														
 
															-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
														
 
															-        // compute graph without a plan (async)
														
 
															-        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
														
 
															-
														
 
															-        // check if the backend supports an operation
														
 
															-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
														
 
															-
														
 
															-        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
														
 
															-        // these should be expensive operations with large batch sizes that may benefit from running on this backend
														
 
															-        // even if the weight has to be copied from the CPU temporarily
														
 
															-        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
														
 
															-
														
 
															-        // (optional) event synchronization
														
 
															-        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
														
 
															-        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
														
 
															-        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
														
 
															-        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
														
 
															-        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
														
 
															-    };
														
 
															-
														
 
															-    struct ggml_backend {
														
 
															-        ggml_guid_t guid;
														
 
															-
														
 
															-        struct ggml_backend_i iface;
														
 
															-        ggml_backend_context_t context;
														
 
															-    };
														
 
															-
														
 
															-    struct ggml_backend_event {
														
 
															-        ggml_backend_t backend;
														
 
															-        void * context;
														
 
															-    };
														
 
															-
														
 
															-    //
														
 
															-    // Backend registry
														
 
															-    //
														
 
															-
														
 
															-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
														
 
															-
														
 
															-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
														
 
															-
														
 
															-#ifdef  __cplusplus
														
 
															-}
														
 
															-#endif
														
 
															+#pragma once

														
 
															+

														
 
															+// ggml-backend internal header

														
 
															+

														
 
															+#include "ggml-backend.h"

														
 
															+

														
 
															+#ifdef  __cplusplus

														
 
															+extern "C" {

														
 
															+#endif

														
 
															+

														
 
															+    //

														
 
															+    // Backend buffer

														
 
															+    //

														
 
															+

														
 
															+    // buffer type

														
 
															+    typedef void * ggml_backend_buffer_type_context_t;

														
 
															+

														
 
															+    struct ggml_backend_buffer_type_i {

														
 
															+        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);

														
 
															+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);

														
 
															+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment

														
 
															+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size

														
 
															+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding

														
 
															+        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend

														
 
															+        // check if tensor data is in host memory

														
 
															+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())

														
 
															+        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);

														
 
															+    };

														
 
															+

														
 
															+    struct ggml_backend_buffer_type {

														
 
															+        struct ggml_backend_buffer_type_i  iface;

														
 
															+        ggml_backend_buffer_type_context_t context;

														
 
															+    };

														
 
															+

														
 
															+    // buffer

														
 
															+    typedef void * ggml_backend_buffer_context_t;

														
 
															+

														
 
															+    struct ggml_backend_buffer_i {

														
 
															+        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);

														
 
															+        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);

														
 
															+        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);

														
 
															+        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

														
 
															+        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);

														
 
															+        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

														
 
															+        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer

														
 
															+        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);

														
 
															+        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras

														
 
															+    };

														
 
															+

														
 
															+    struct ggml_backend_buffer {

														
 
															+        struct ggml_backend_buffer_i  iface;

														
 
															+        ggml_backend_buffer_type_t    buft;

														
 
															+        ggml_backend_buffer_context_t context;

														
 
															+        size_t size;

														
 
															+        enum ggml_backend_buffer_usage usage;

														
 
															+    };

														
 
															+

														
 
															+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(

														
 
															+                   ggml_backend_buffer_type_t      buft,

														
 
															+            struct ggml_backend_buffer_i           iface,

														
 
															+                   ggml_backend_buffer_context_t   context,

														
 
															+                   size_t                          size);

														
 
															+

														
 
															+    // do not use directly, use ggml_backend_tensor_copy instead

														
 
															+    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

														
 
															+

														
 
															+    // buffer that contains a collection of buffers

														
 
															+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);

														
 
															+    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);

														
 
															+    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);

														
 
															+

														
 
															+    //

														
 
															+    // Backend

														
 
															+    //

														
 
															+

														
 
															+    typedef void * ggml_backend_context_t;

														
 
															+

														
 
															+    struct ggml_backend_i {

														
 
															+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);

														
 
															+

														
 
															+        void (*GGML_CALL free)(ggml_backend_t backend);

														
 
															+

														
 
															+        // buffer allocation

														
 
															+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);

														
 
															+

														
 
															+        // (optional) asynchronous tensor data access

														
 
															+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);

														
 
															+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

														
 
															+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

														
 
															+

														
 
															+        // (optional) complete all pending operations

														
 
															+        void (*GGML_CALL synchronize)(ggml_backend_t backend);

														
 
															+

														
 
															+        // compute graph with a plan (not used currently)

														
 
															+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);

														
 
															+        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

														
 
															+

														
 
															+        // compute graph with a plan

														
 
															+        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

														
 
															+        // compute graph without a plan (async)

														
 
															+        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);

														
 
															+

														
 
															+        // check if the backend supports an operation

														
 
															+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

														
 
															+

														
 
															+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer

														
 
															+        // these should be expensive operations with large batch sizes that may benefit from running on this backend

														
 
															+        // even if the weight has to be copied from the CPU temporarily

														
 
															+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);

														
 
															+

														
 
															+        // (optional) event synchronization

														
 
															+        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);

														
 
															+        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);

														
 
															+        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);

														
 
															+        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);

														
 
															+        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);

														
 
															+    };

														
 
															+

														
 
															+    struct ggml_backend {

														
 
															+        ggml_guid_t guid;

														
 
															+

														
 
															+        struct ggml_backend_i iface;

														
 
															+        ggml_backend_context_t context;

														
 
															+    };

														
 
															+

														
 
															+    struct ggml_backend_event {

														
 
															+        ggml_backend_t backend;

														
 
															+        void * context;

														
 
															+    };

														
 
															+

														
 
															+    //

														
 
															+    // Backend registry

														
 
															+    //

														
 
															+

														
 
															+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);

														
 
															+

														
 
															+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);

														
 
															+

														
 
															+#ifdef  __cplusplus

														
 
															+}

														
 
															+#endif

														
--- a/llama/ggml-backend.c
+++ b/llama/ggml-backend.c
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/ggml-backend.h
+++ b/llama/ggml-backend.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/ggml-common.h
+++ b/llama/ggml-common.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/ggml-cuda.cu
+++ b/llama/ggml-cuda.cu
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -1377,10 +1377,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
 
															     GGML_UNUSED(main_device);
														
 
															 }
														
 
															+static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
														
 
															+    void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
														
 
															+
														
 
															+#if !defined(GGML_USE_HIPBLAS)
														
 
															+    // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
														
 
															+    cudaMemcpy3DPeerParms p = {};
														
 
															+    p.dstDevice = dstDevice;
														
 
															+    p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
														
 
															+    p.srcDevice = srcDevice;
														
 
															+    p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
														
 
															+    p.extent = make_cudaExtent(width, height, 1);
														
 
															+    return cudaMemcpy3DPeerAsync(&p, stream);
														
 
															+#else
														
 
															+    // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
														
 
															+    GGML_UNUSED(dstDevice);
														
 
															+    GGML_UNUSED(srcDevice);
														
 
															+    return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
														
 
															+#endif // !defined(GGML_USE_HIPBLAS)
														
 
															+}
														
 
															+
														
 
															 static void ggml_cuda_op_mul_mat(
														
 
															     ggml_backend_cuda_context & ctx,
														
 
															     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
														
 
															-    const bool convert_src1_to_q8_1) {
														
 
															+    quantize_cuda_t quantize_src1) {
														
 
															     const int64_t ne00 = src0->ne[0];
														
 
															     const int64_t ne01 = src0->ne[1];
														
@@ -1437,7 +1457,9 @@ static void ggml_cuda_op_mul_mat(
 
															     }
														
 
															     struct dev_data {
														
 
															-        ggml_cuda_pool_alloc<char>  src0_dd_alloc;
														
 
															+        int cc;
														
 
															+
														
 
															+        ggml_cuda_pool_alloc<char>   src0_dd_alloc;
														
 
															         ggml_cuda_pool_alloc<float> src1_ddf_alloc;
														
 
															         ggml_cuda_pool_alloc<char>  src1_ddq_alloc;
														
 
															         ggml_cuda_pool_alloc<float>   dst_dd_alloc;
														
@@ -1456,6 +1478,8 @@ static void ggml_cuda_op_mul_mat(
 
															     int used_devices = 0;
														
 
															     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
														
 
															+        dev[id].cc = ggml_cuda_info().devices[id].cc;
														
 
															+
														
 
															         // by default, use all rows
														
 
															         dev[id].row_low  = 0;
														
 
															         dev[id].row_high = ne01;
														
@@ -1506,11 +1530,15 @@ static void ggml_cuda_op_mul_mat(
 
															             dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
														
 
															         }
														
 
															-        if (convert_src1_to_q8_1) {
														
 
															-            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
														
 
															+        if (quantize_src1) {
														
 
															+            size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
														
 
															+            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
														
 
															+                src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
														
 
															+            }
														
 
															+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
														
 
															             if (src1_on_device && src1_is_contiguous) {
														
 
															-                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
														
 
															+                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
														
 
															                 CUDA_CHECK(cudaGetLastError());
														
 
															             }
														
 
															         }
														
@@ -1556,7 +1584,12 @@ static void ggml_cuda_op_mul_mat(
 
															                 const int64_t i03 = i0 / ne12;
														
 
															                 const int64_t i02 = i0 % ne12;
														
 
															-                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
														
 
															+                size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
														
 
															+                if (quantize_src1 == quantize_mmq_q8_1_cuda) {
														
 
															+                    src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
														
 
															+                } else {
														
 
															+                    src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
														
 
															+                }
														
 
															                 // for split tensors the data begins at i0 == i0_offset_low
														
 
															                 char  *  src0_dd_i =  dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
														
@@ -1573,10 +1606,17 @@ static void ggml_cuda_op_mul_mat(
 
															                 // copy src0, src1 to device if necessary
														
 
															                 if (src1_is_contiguous) {
														
 
															                     if (id != ctx.device) {
														
 
															-                        if (convert_src1_to_q8_1) {
														
 
															+                        if (quantize_src1) {
														
 
															                             char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
														
 
															-                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
														
 
															-                                                            src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
														
 
															+                            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
														
 
															+                                const size_t pitch = ne11*sizeof(block_q8_1_mmq);
														
 
															+                                const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
														
 
															+                                const size_t height = src1_padded_col_size/(4*QK8_1);
														
 
															+                                CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
														
 
															+                            } else {
														
 
															+                                CUDA_CHECK(cudaMemcpyPeerAsync(
														
 
															+                                    src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
														
 
															+                            }
														
 
															                         } else {
														
 
															                             float * src1_ddf_i_source = (float *) src1->data;
														
 
															                             src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
														
@@ -1591,8 +1631,8 @@ static void ggml_cuda_op_mul_mat(
 
															                     GGML_ASSERT(false);
														
 
															                 }
														
 
															-                if (convert_src1_to_q8_1 && !src1_is_contiguous) {
														
 
															-                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
														
 
															+                if (quantize_src1 && !src1_is_contiguous) {
														
 
															+                    quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
														
 
															                     CUDA_CHECK(cudaGetLastError());
														
 
															                 }
														
@@ -1617,22 +1657,8 @@ static void ggml_cuda_op_mul_mat(
 
															                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
														
 
															                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
														
 
															                         dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
														
 
															-#if !defined(GGML_USE_HIPBLAS)
														
 
															-                        // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
														
 
															-                        cudaMemcpy3DPeerParms p = {};
														
 
															-                        p.dstDevice = ctx.device;
														
 
															-                        p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
														
 
															-                        p.srcDevice = id;
														
 
															-                        p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
														
 
															-                        p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
														
 
															-                        CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
														
 
															-#else
														
 
															-                        // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
														
 
															-                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
														
 
															-                                                        dst_dd_i, row_diff*sizeof(float),
														
 
															-                                                        row_diff*sizeof(float), src1_ncols,
														
 
															-                                                        cudaMemcpyDeviceToDevice, stream));
														
 
															-#endif
														
 
															+                        CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
														
 
															+                            dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
														
 
															                     } else {
														
 
															                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
														
 
															                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
														
@@ -1971,13 +1997,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
 
															         // KQ + KQV multi-batch
														
 
															         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
														
 
															     } else if (use_dequantize_mul_mat_vec) {
														
 
															-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
														
 
															+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
														
 
															     } else if (use_mul_mat_vec_q) {
														
 
															-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
														
 
															+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
														
 
															     } else if (use_mul_mat_q) {
														
 
															-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
														
 
															+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
														
 
															     } else {
														
 
															-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
														
 
															+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
														
 
															     }
														
 
															 }
														
--- a/llama/ggml-cuda.h
+++ b/llama/ggml-cuda.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git d5c938cd7716b9a2ace49a43a469dfbffcff4d28
														
 
															+ * llama.cpp - git e95beeb1fc4621826ddd616776dbdf717366bf5c
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/ggml-cuda/acc.cu
+++ b/llama/ggml-cuda/acc.cu
@@ -1,47 +1,47 @@
 
															-#include "acc.cuh"
														
 
															-
														
 
															-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
														
 
															-    const int ne10, const int ne11, const int ne12,
														
 
															-    const int nb1, const int nb2, int offset) {
														
 
															-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
														
 
															-    if (i >= ne) {
														
 
															-        return;
														
 
															-    }
														
 
															-    int src1_idx = i - offset;
														
 
															-    int oz = src1_idx / nb2;
														
 
															-    int oy = (src1_idx - (oz * nb2)) / nb1;
														
 
															-    int ox = src1_idx % nb1;
														
 
															-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
														
 
															-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
														
 
															-    } else {
														
 
															-        dst[i] = x[i];
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
														
 
															-    const int ne10, const int ne11, const int ne12,
														
 
															-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
														
 
															-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
														
 
															-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * src0 = dst->src[0];
														
 
															-    const ggml_tensor * src1 = dst->src[1];
														
 
															-    const float * src0_d = (const float *)src0->data;
														
 
															-    const float * src1_d = (const float *)src1->data;
														
 
															-    float * dst_d = (float *)dst->data;
														
 
															-    cudaStream_t stream = ctx.stream();
														
 
															-
														
 
															-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
														
 
															-
														
 
															-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
														
 
															-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
														
 
															-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
														
 
															-    int offset = dst->op_params[3] / 4; // offset in bytes
														
 
															-
														
 
															-    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
														
 
															-}
														
 
															+#include "acc.cuh"

														
 
															+

														
 
															+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,

														
 
															+    const int ne10, const int ne11, const int ne12,

														
 
															+    const int nb1, const int nb2, int offset) {

														
 
															+    const int i = blockDim.x * blockIdx.x + threadIdx.x;

														
 
															+    if (i >= ne) {

														
 
															+        return;

														
 
															+    }

														
 
															+    int src1_idx = i - offset;

														
 
															+    int oz = src1_idx / nb2;

														
 
															+    int oy = (src1_idx - (oz * nb2)) / nb1;

														
 
															+    int ox = src1_idx % nb1;

														
 
															+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {

														
 
															+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];

														
 
															+    } else {

														
 
															+        dst[i] = x[i];

														
 
															+    }

														
 
															+}

														
 
															+

														
 
															+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,

														
 
															+    const int ne10, const int ne11, const int ne12,

														
 
															+    const int nb1, const int nb2, const int offset, cudaStream_t stream) {

														
 
															+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;

														
 
															+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    const ggml_tensor * src0 = dst->src[0];

														
 
															+    const ggml_tensor * src1 = dst->src[1];

														
 
															+    const float * src0_d = (const float *)src0->data;

														
 
															+    const float * src1_d = (const float *)src1->data;

														
 
															+    float * dst_d = (float *)dst->data;

														
 
															+    cudaStream_t stream = ctx.stream();

														
 
															+

														
 
															+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT(src1->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported

														
 
															+

														
 
															+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32

														
 
															+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32

														
 
															+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused

														
 
															+    int offset = dst->op_params[3] / 4; // offset in bytes

														
 
															+

														
 
															+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);

														
 
															+}

														
--- a/llama/ggml-cuda/acc.cuh
+++ b/llama/ggml-cuda/acc.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_ACC_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_ACC_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/arange.cu
+++ b/llama/ggml-cuda/arange.cu
@@ -1,34 +1,34 @@
 
															-#include "arange.cuh"
														
 
															-
														
 
															-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
														
 
															-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
														
 
															-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
														
 
															-    if (nidx >= ne0) {
														
 
															-        return;
														
 
															-    }
														
 
															-    dst[nidx] = start + step * nidx;
														
 
															-}
														
 
															-
														
 
															-static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
														
 
															-    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
														
 
															-    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    float * dst_d = (float *)dst->data;
														
 
															-    cudaStream_t stream = ctx.stream();
														
 
															-
														
 
															-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
														
 
															-
														
 
															-    float start;
														
 
															-    float stop;
														
 
															-    float step;
														
 
															-    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
														
 
															-    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
														
 
															-    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
														
 
															-
														
 
															-    int64_t steps = (int64_t)ceil((stop - start) / step);
														
 
															-    GGML_ASSERT(ggml_nelements(dst) == steps);
														
 
															-
														
 
															-    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
														
 
															-}
														
 
															+#include "arange.cuh"

														
 
															+

														
 
															+static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {

														
 
															+    // blockIDx.x: idx of ne0 / BLOCK_SIZE

														
 
															+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;

														
 
															+    if (nidx >= ne0) {

														
 
															+        return;

														
 
															+    }

														
 
															+    dst[nidx] = start + step * nidx;

														
 
															+}

														
 
															+

														
 
															+static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {

														
 
															+    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;

														
 
															+    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    float * dst_d = (float *)dst->data;

														
 
															+    cudaStream_t stream = ctx.stream();

														
 
															+

														
 
															+    GGML_ASSERT(dst->type == GGML_TYPE_F32);

														
 
															+

														
 
															+    float start;

														
 
															+    float stop;

														
 
															+    float step;

														
 
															+    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));

														
 
															+    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));

														
 
															+    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));

														
 
															+

														
 
															+    int64_t steps = (int64_t)ceil((stop - start) / step);

														
 
															+    GGML_ASSERT(ggml_nelements(dst) == steps);

														
 
															+

														
 
															+    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);

														
 
															+}

														
--- a/llama/ggml-cuda/arange.cuh
+++ b/llama/ggml-cuda/arange.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_ARANGE_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_ARANGE_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/argsort.cu
+++ b/llama/ggml-cuda/argsort.cu
@@ -1,103 +1,103 @@
 
															-#include "argsort.cuh"
														
 
															-
														
 
															-template<typename T>
														
 
															-static inline __device__ void ggml_cuda_swap(T & a, T & b) {
														
 
															-    T tmp = a;
														
 
															-    a = b;
														
 
															-    b = tmp;
														
 
															-}
														
 
															-
														
 
															-template<ggml_sort_order order>
														
 
															-static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
														
 
															-    // bitonic sort
														
 
															-    int col = threadIdx.x;
														
 
															-    int row = blockIdx.y;
														
 
															-
														
 
															-    if (col >= ncols_pad) {
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    const float * x_row = x + row * ncols;
														
 
															-    extern __shared__ int dst_row[];
														
 
															-
														
 
															-    // initialize indices
														
 
															-    dst_row[col] = col;
														
 
															-
														
 
															-    __syncthreads();
														
 
															-
														
 
															-    for (int k = 2; k <= ncols_pad; k *= 2) {
														
 
															-        for (int j = k / 2; j > 0; j /= 2) {
														
 
															-            int ixj = col ^ j;
														
 
															-            if (ixj > col) {
														
 
															-                if ((col & k) == 0) {
														
 
															-                    if (dst_row[col] >= ncols ||
														
 
															-                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
														
 
															-                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
														
 
															-                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
														
 
															-                    ) {
														
 
															-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
														
 
															-                    }
														
 
															-                } else {
														
 
															-                    if (dst_row[ixj] >= ncols ||
														
 
															-                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
														
 
															-                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
														
 
															-                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
														
 
															-                    ) {
														
 
															-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
														
 
															-                    }
														
 
															-                }
														
 
															-            }
														
 
															-            __syncthreads();
														
 
															-        }
														
 
															-    }
														
 
															-
														
 
															-    // copy the result to dst without the padding
														
 
															-    if (col < ncols) {
														
 
															-        dst[row * ncols + col] = dst_row[col];
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-static int next_power_of_2(int x) {
														
 
															-    int n = 1;
														
 
															-    while (n < x) {
														
 
															-        n *= 2;
														
 
															-    }
														
 
															-    return n;
														
 
															-}
														
 
															-
														
 
															-static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
														
 
															-    // bitonic sort requires ncols to be power of 2
														
 
															-    const int ncols_pad = next_power_of_2(ncols);
														
 
															-
														
 
															-    const dim3 block_dims(ncols_pad, 1, 1);
														
 
															-    const dim3 block_nums(1, nrows, 1);
														
 
															-    const size_t shared_mem = ncols_pad * sizeof(int);
														
 
															-
														
 
															-    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);
														
 
															-
														
 
															-    if (order == GGML_SORT_ORDER_ASC) {
														
 
															-        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
														
 
															-    } else if (order == GGML_SORT_ORDER_DESC) {
														
 
															-        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
														
 
															-    } else {
														
 
															-        GGML_ASSERT(false);
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * src0 = dst->src[0];
														
 
															-    const float * src0_d = (const float *)src0->data;
														
 
															-    float * dst_d = (float *)dst->data;
														
 
															-    cudaStream_t stream = ctx.stream();
														
 
															-
														
 
															-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
														
 
															-    GGML_ASSERT(ggml_is_contiguous(src0));
														
 
															-
														
 
															-    const int64_t ncols = src0->ne[0];
														
 
															-    const int64_t nrows = ggml_nrows(src0);
														
 
															-
														
 
															-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
														
 
															-
														
 
															-    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
														
 
															-}
														
 
															+#include "argsort.cuh"

														
 
															+

														
 
															+template<typename T>

														
 
															+static inline __device__ void ggml_cuda_swap(T & a, T & b) {

														
 
															+    T tmp = a;

														
 
															+    a = b;

														
 
															+    b = tmp;

														
 
															+}

														
 
															+

														
 
															+template<ggml_sort_order order>

														
 
															+static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {

														
 
															+    // bitonic sort

														
 
															+    int col = threadIdx.x;

														
 
															+    int row = blockIdx.y;

														
 
															+

														
 
															+    if (col >= ncols_pad) {

														
 
															+        return;

														
 
															+    }

														
 
															+

														
 
															+    const float * x_row = x + row * ncols;

														
 
															+    extern __shared__ int dst_row[];

														
 
															+

														
 
															+    // initialize indices

														
 
															+    dst_row[col] = col;

														
 
															+

														
 
															+    __syncthreads();

														
 
															+

														
 
															+    for (int k = 2; k <= ncols_pad; k *= 2) {

														
 
															+        for (int j = k / 2; j > 0; j /= 2) {

														
 
															+            int ixj = col ^ j;

														
 
															+            if (ixj > col) {

														
 
															+                if ((col & k) == 0) {

														
 
															+                    if (dst_row[col] >= ncols ||

														
 
															+                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?

														
 
															+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :

														
 
															+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))

														
 
															+                    ) {

														
 
															+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);

														
 
															+                    }

														
 
															+                } else {

														
 
															+                    if (dst_row[ixj] >= ncols ||

														
 
															+                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?

														
 
															+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :

														
 
															+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))

														
 
															+                    ) {

														
 
															+                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);

														
 
															+                    }

														
 
															+                }

														
 
															+            }

														
 
															+            __syncthreads();

														
 
															+        }

														
 
															+    }

														
 
															+

														
 
															+    // copy the result to dst without the padding

														
 
															+    if (col < ncols) {

														
 
															+        dst[row * ncols + col] = dst_row[col];

														
 
															+    }

														
 
															+}

														
 
															+

														
 
															+static int next_power_of_2(int x) {

														
 
															+    int n = 1;

														
 
															+    while (n < x) {

														
 
															+        n *= 2;

														
 
															+    }

														
 
															+    return n;

														
 
															+}

														
 
															+

														
 
															+static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {

														
 
															+    // bitonic sort requires ncols to be power of 2

														
 
															+    const int ncols_pad = next_power_of_2(ncols);

														
 
															+

														
 
															+    const dim3 block_dims(ncols_pad, 1, 1);

														
 
															+    const dim3 block_nums(1, nrows, 1);

														
 
															+    const size_t shared_mem = ncols_pad * sizeof(int);

														
 
															+

														
 
															+    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);

														
 
															+

														
 
															+    if (order == GGML_SORT_ORDER_ASC) {

														
 
															+        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);

														
 
															+    } else if (order == GGML_SORT_ORDER_DESC) {

														
 
															+        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);

														
 
															+    } else {

														
 
															+        GGML_ASSERT(false);

														
 
															+    }

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    const ggml_tensor * src0 = dst->src[0];

														
 
															+    const float * src0_d = (const float *)src0->data;

														
 
															+    float * dst_d = (float *)dst->data;

														
 
															+    cudaStream_t stream = ctx.stream();

														
 
															+

														
 
															+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT( dst->type == GGML_TYPE_I32);

														
 
															+    GGML_ASSERT(ggml_is_contiguous(src0));

														
 
															+

														
 
															+    const int64_t ncols = src0->ne[0];

														
 
															+    const int64_t nrows = ggml_nrows(src0);

														
 
															+

														
 
															+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

														
 
															+

														
 
															+    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);

														
 
															+}

														
--- a/llama/ggml-cuda/argsort.cuh
+++ b/llama/ggml-cuda/argsort.cuh
@@ -1,3 +1,3 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/binbcast.cu
+++ b/llama/ggml-cuda/binbcast.cu
@@ -1,280 +1,280 @@
 
															-#include "binbcast.cuh"
														
 
															-
														
 
															-static __device__ __forceinline__ float op_repeat(const float a, const float b) {
														
 
															-    return b;
														
 
															-    GGML_UNUSED(a);
														
 
															-}
														
 
															-
														
 
															-static __device__ __forceinline__ float op_add(const float a, const float b) {
														
 
															-    return a + b;
														
 
															-}
														
 
															-
														
 
															-static __device__ __forceinline__ float op_mul(const float a, const float b) {
														
 
															-    return a * b;
														
 
															-}
														
 
															-
														
 
															-static __device__ __forceinline__ float op_div(const float a, const float b) {
														
 
															-    return a / b;
														
 
															-}
														
 
															-
														
 
															-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
														
 
															-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
														
 
															-        int ne0, int ne1, int ne2, int ne3,
														
 
															-        int ne10, int ne11, int ne12, int ne13,
														
 
															-        /*int s0, */ int s1,  int s2,  int s3,
														
 
															-        /*int s00,*/ int s01, int s02, int s03,
														
 
															-        /*int s10,*/ int s11, int s12, int s13) {
														
 
															-    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															-    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
														
 
															-    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
														
 
															-    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
														
 
															-
														
 
															-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    const int i11 = i1 % ne11;
														
 
															-    const int i12 = i2 % ne12;
														
 
															-    const int i13 = i3 % ne13;
														
 
															-
														
 
															-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
														
 
															-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
														
 
															-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
														
 
															-
														
 
															-    const src0_t * src0_row = src0 + i_src0;
														
 
															-    const src1_t * src1_row = src1 + i_src1;
														
 
															-    dst_t * dst_row = dst + i_dst;
														
 
															-
														
 
															-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
														
 
															-        const int i10 = i0 % ne10;
														
 
															-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
														
 
															-static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
														
 
															-        int ne0, int ne1, int ne2, int ne3,
														
 
															-        int ne10, int ne11, int ne12, int ne13,
														
 
															-        /*int s0, */ int s1,  int s2,  int s3,
														
 
															-        /*int s00,*/ int s01, int s02, int s03,
														
 
															-        /*int s10,*/ int s11, int s12, int s13) {
														
 
															-
														
 
															-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															-
														
 
															-    const int i3 = i/(ne2*ne1*ne0);
														
 
															-    const int i2 = (i/(ne1*ne0)) % ne2;
														
 
															-    const int i1 = (i/ne0) % ne1;
														
 
															-    const int i0 = i % ne0;
														
 
															-
														
 
															-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    const int i11 = i1 % ne11;
														
 
															-    const int i12 = i2 % ne12;
														
 
															-    const int i13 = i3 % ne13;
														
 
															-
														
 
															-    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
														
 
															-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
														
 
															-    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
														
 
															-
														
 
															-    const src0_t * src0_row = src0 + i_src0;
														
 
															-    const src1_t * src1_row = src1 + i_src1;
														
 
															-    dst_t * dst_row = dst + i_dst;
														
 
															-
														
 
															-    const int i10 = i0 % ne10;
														
 
															-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
														
 
															-}
														
 
															-
														
 
															-template<float (*bin_op)(const float, const float)>
														
 
															-struct bin_bcast_cuda {
														
 
															-    template<typename src0_t, typename src1_t, typename dst_t>
														
 
															-    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
														
 
															-            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,
														
 
															-            cudaStream_t stream) {
														
 
															-
														
 
															-        GGML_TENSOR_BINARY_OP_LOCALS
														
 
															-
														
 
															-        int nr0 = ne10/ne0;
														
 
															-        int nr1 = ne11/ne1;
														
 
															-        int nr2 = ne12/ne2;
														
 
															-        int nr3 = ne13/ne3;
														
 
															-
														
 
															-        int nr[4] = { nr0, nr1, nr2, nr3 };
														
 
															-
														
 
															-        // collapse dimensions until first broadcast dimension
														
 
															-        int64_t cne[] = {ne0, ne1, ne2, ne3};
														
 
															-        int64_t cne0[] = {ne00, ne01, ne02, ne03};
														
 
															-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
														
 
															-
														
 
															-        size_t cnb[] = {nb0, nb1, nb2, nb3};
														
 
															-        size_t cnb0[] = {nb00, nb01, nb02, nb03};
														
 
															-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
														
 
															-
														
 
															-        auto collapse = [](int64_t cne[]) {
														
 
															-            cne[0] *= cne[1];
														
 
															-            cne[1] = cne[2];
														
 
															-            cne[2] = cne[3];
														
 
															-            cne[3] = 1;
														
 
															-        };
														
 
															-
														
 
															-        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
														
 
															-            cnb[1] *= cne[1];
														
 
															-            cnb[2] *= cne[2];
														
 
															-            cnb[3] *= cne[3];
														
 
															-        };
														
 
															-
														
 
															-        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
														
 
															-            for (int i = 0; i < 4; i++) {
														
 
															-                if (nr[i] != 1) {
														
 
															-                    break;
														
 
															-                }
														
 
															-                if (i > 0) {
														
 
															-                    collapse_nb(cnb, cne);
														
 
															-                    collapse_nb(cnb0, cne0);
														
 
															-                    collapse_nb(cnb1, cne1);
														
 
															-                    collapse(cne);
														
 
															-                    collapse(cne0);
														
 
															-                    collapse(cne1);
														
 
															-                }
														
 
															-            }
														
 
															-        }
														
 
															-
														
 
															-        {
														
 
															-            int64_t ne0 = cne[0];
														
 
															-            int64_t ne1 = cne[1];
														
 
															-            int64_t ne2 = cne[2];
														
 
															-            int64_t ne3 = cne[3];
														
 
															-
														
 
															-            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
														
 
															-            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
														
 
															-            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
														
 
															-            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
														
 
															-
														
 
															-            int64_t ne10 = cne1[0];
														
 
															-            int64_t ne11 = cne1[1];
														
 
															-            int64_t ne12 = cne1[2];
														
 
															-            int64_t ne13 = cne1[3];
														
 
															-
														
 
															-            size_t nb0 = cnb[0];
														
 
															-            size_t nb1 = cnb[1];
														
 
															-            size_t nb2 = cnb[2];
														
 
															-            size_t nb3 = cnb[3];
														
 
															-
														
 
															-            size_t nb00 = cnb0[0];
														
 
															-            size_t nb01 = cnb0[1];
														
 
															-            size_t nb02 = cnb0[2];
														
 
															-            size_t nb03 = cnb0[3];
														
 
															-
														
 
															-            size_t nb10 = cnb1[0];
														
 
															-            size_t nb11 = cnb1[1];
														
 
															-            size_t nb12 = cnb1[2];
														
 
															-            size_t nb13 = cnb1[3];
														
 
															-
														
 
															-            size_t s0 = nb0 / sizeof(dst_t);
														
 
															-            size_t s1 = nb1 / sizeof(dst_t);
														
 
															-            size_t s2 = nb2 / sizeof(dst_t);
														
 
															-            size_t s3 = nb3 / sizeof(dst_t);
														
 
															-
														
 
															-            size_t s10 = nb10 / sizeof(src1_t);
														
 
															-            size_t s11 = nb11 / sizeof(src1_t);
														
 
															-            size_t s12 = nb12 / sizeof(src1_t);
														
 
															-            size_t s13 = nb13 / sizeof(src1_t);
														
 
															-
														
 
															-            size_t s00 = nb00 / sizeof(src0_t);
														
 
															-            size_t s01 = nb01 / sizeof(src0_t);
														
 
															-            size_t s02 = nb02 / sizeof(src0_t);
														
 
															-            size_t s03 = nb03 / sizeof(src0_t);
														
 
															-
														
 
															-            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
														
 
															-            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
														
 
															-            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
														
 
															-            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
														
 
															-
														
 
															-            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
														
 
															-            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
														
 
															-            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
														
 
															-            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
														
 
															-
														
 
															-            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
														
 
															-            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
														
 
															-            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
														
 
															-            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
														
 
															-
														
 
															-            GGML_ASSERT(s0 == 1);
														
 
															-            GGML_ASSERT(s00 == 1);
														
 
															-            GGML_ASSERT(s10 == 1);
														
 
															-
														
 
															-            const int block_size = 128;
														
 
															-
														
 
															-            int64_t hne0 = std::max(ne0/2LL, 1LL);
														
 
															-
														
 
															-            dim3 block_dims;
														
 
															-            block_dims.x = std::min<unsigned int>(hne0, block_size);
														
 
															-            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
														
 
															-            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);
														
 
															-
														
 
															-            dim3 block_nums(
														
 
															-                (hne0 + block_dims.x - 1) / block_dims.x,
														
 
															-                (ne1 + block_dims.y - 1) / block_dims.y,
														
 
															-                (ne2*ne3 + block_dims.z - 1) / block_dims.z
														
 
															-            );
														
 
															-
														
 
															-            if (block_nums.z > 65535) {
														
 
															-                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
														
 
															-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
														
 
															-                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
														
 
															-                    src0_dd, src1_dd, dst_dd,
														
 
															-                    ne0, ne1, ne2, ne3,
														
 
															-                    ne10, ne11, ne12, ne13,
														
 
															-                    /* s0, */ s1, s2, s3,
														
 
															-                    /* s00, */ s01, s02, s03,
														
 
															-                    /* s10, */ s11, s12, s13);
														
 
															-            } else {
														
 
															-                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
														
 
															-                    src0_dd, src1_dd, dst_dd,
														
 
															-                    ne0, ne1, ne2, ne3,
														
 
															-                    ne10, ne11, ne12, ne13,
														
 
															-                    /* s0, */ s1, s2, s3,
														
 
															-                    /* s00, */ s01, s02, s03,
														
 
															-                    /* s10, */ s11, s12, s13);
														
 
															-            }
														
 
															-        }
														
 
															-    }
														
 
															-};
														
 
															-
														
 
															-template<class op>
														
 
															-static void ggml_cuda_op_bin_bcast(
														
 
															-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
														
 
															-    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
														
 
															-
														
 
															-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
														
 
															-
														
 
															-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
														
 
															-        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
														
 
															-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
														
 
															-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
														
 
															-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
														
 
															-        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
														
 
															-    } else {
														
 
															-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
														
 
															-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
														
 
															-        GGML_ASSERT(false);
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
														
 
															-}
														
 
															+#include "binbcast.cuh"

														
 
															+

														
 
															+static __device__ __forceinline__ float op_repeat(const float a, const float b) {

														
 
															+    return b;

														
 
															+    GGML_UNUSED(a);

														
 
															+}

														
 
															+

														
 
															+static __device__ __forceinline__ float op_add(const float a, const float b) {

														
 
															+    return a + b;

														
 
															+}

														
 
															+

														
 
															+static __device__ __forceinline__ float op_mul(const float a, const float b) {

														
 
															+    return a * b;

														
 
															+}

														
 
															+

														
 
															+static __device__ __forceinline__ float op_div(const float a, const float b) {

														
 
															+    return a / b;

														
 
															+}

														
 
															+

														
 
															+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>

														
 
															+static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,

														
 
															+        int ne0, int ne1, int ne2, int ne3,

														
 
															+        int ne10, int ne11, int ne12, int ne13,

														
 
															+        /*int s0, */ int s1,  int s2,  int s3,

														
 
															+        /*int s00,*/ int s01, int s02, int s03,

														
 
															+        /*int s10,*/ int s11, int s12, int s13) {

														
 
															+    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;

														
 
															+    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);

														
 
															+    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;

														
 
															+    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;

														
 
															+

														
 
															+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {

														
 
															+        return;

														
 
															+    }

														
 
															+

														
 
															+    const int i11 = i1 % ne11;

														
 
															+    const int i12 = i2 % ne12;

														
 
															+    const int i13 = i3 % ne13;

														
 
															+

														
 
															+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;

														
 
															+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;

														
 
															+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;

														
 
															+

														
 
															+    const src0_t * src0_row = src0 + i_src0;

														
 
															+    const src1_t * src1_row = src1 + i_src1;

														
 
															+    dst_t * dst_row = dst + i_dst;

														
 
															+

														
 
															+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {

														
 
															+        const int i10 = i0 % ne10;

														
 
															+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);

														
 
															+    }

														
 
															+}

														
 
															+

														
 
															+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>

														
 
															+static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,

														
 
															+        int ne0, int ne1, int ne2, int ne3,

														
 
															+        int ne10, int ne11, int ne12, int ne13,

														
 
															+        /*int s0, */ int s1,  int s2,  int s3,

														
 
															+        /*int s00,*/ int s01, int s02, int s03,

														
 
															+        /*int s10,*/ int s11, int s12, int s13) {

														
 
															+

														
 
															+    const int i = blockDim.x*blockIdx.x + threadIdx.x;

														
 
															+

														
 
															+    const int i3 = i/(ne2*ne1*ne0);

														
 
															+    const int i2 = (i/(ne1*ne0)) % ne2;

														
 
															+    const int i1 = (i/ne0) % ne1;

														
 
															+    const int i0 = i % ne0;

														
 
															+

														
 
															+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {

														
 
															+        return;

														
 
															+    }

														
 
															+

														
 
															+    const int i11 = i1 % ne11;

														
 
															+    const int i12 = i2 % ne12;

														
 
															+    const int i13 = i3 % ne13;

														
 
															+

														
 
															+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;

														
 
															+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;

														
 
															+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;

														
 
															+

														
 
															+    const src0_t * src0_row = src0 + i_src0;

														
 
															+    const src1_t * src1_row = src1 + i_src1;

														
 
															+    dst_t * dst_row = dst + i_dst;

														
 
															+

														
 
															+    const int i10 = i0 % ne10;

														
 
															+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);

														
 
															+}

														
 
															+

														
 
															+template<float (*bin_op)(const float, const float)>

														
 
															+struct bin_bcast_cuda {

														
 
															+    template<typename src0_t, typename src1_t, typename dst_t>

														
 
															+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,

														
 
															+            const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd,

														
 
															+            cudaStream_t stream) {

														
 
															+

														
 
															+        GGML_TENSOR_BINARY_OP_LOCALS

														
 
															+

														
 
															+        int nr0 = ne10/ne0;

														
 
															+        int nr1 = ne11/ne1;

														
 
															+        int nr2 = ne12/ne2;

														
 
															+        int nr3 = ne13/ne3;

														
 
															+

														
 
															+        int nr[4] = { nr0, nr1, nr2, nr3 };

														
 
															+

														
 
															+        // collapse dimensions until first broadcast dimension

														
 
															+        int64_t cne[] = {ne0, ne1, ne2, ne3};

														
 
															+        int64_t cne0[] = {ne00, ne01, ne02, ne03};

														
 
															+        int64_t cne1[] = {ne10, ne11, ne12, ne13};

														
 
															+

														
 
															+        size_t cnb[] = {nb0, nb1, nb2, nb3};

														
 
															+        size_t cnb0[] = {nb00, nb01, nb02, nb03};

														
 
															+        size_t cnb1[] = {nb10, nb11, nb12, nb13};

														
 
															+

														
 
															+        auto collapse = [](int64_t cne[]) {

														
 
															+            cne[0] *= cne[1];

														
 
															+            cne[1] = cne[2];

														
 
															+            cne[2] = cne[3];

														
 
															+            cne[3] = 1;

														
 
															+        };

														
 
															+

														
 
															+        auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {

														
 
															+            cnb[1] *= cne[1];

														
 
															+            cnb[2] *= cne[2];

														
 
															+            cnb[3] *= cne[3];

														
 
															+        };

														
 
															+

														
 
															+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {

														
 
															+            for (int i = 0; i < 4; i++) {

														
 
															+                if (nr[i] != 1) {

														
 
															+                    break;

														
 
															+                }

														
 
															+                if (i > 0) {

														
 
															+                    collapse_nb(cnb, cne);

														
 
															+                    collapse_nb(cnb0, cne0);

														
 
															+                    collapse_nb(cnb1, cne1);

														
 
															+                    collapse(cne);

														
 
															+                    collapse(cne0);

														
 
															+                    collapse(cne1);

														
 
															+                }

														
 
															+            }

														
 
															+        }

														
 
															+

														
 
															+        {

														
 
															+            int64_t ne0 = cne[0];

														
 
															+            int64_t ne1 = cne[1];

														
 
															+            int64_t ne2 = cne[2];

														
 
															+            int64_t ne3 = cne[3];

														
 
															+

														
 
															+            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);

														
 
															+            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);

														
 
															+            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);

														
 
															+            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);

														
 
															+

														
 
															+            int64_t ne10 = cne1[0];

														
 
															+            int64_t ne11 = cne1[1];

														
 
															+            int64_t ne12 = cne1[2];

														
 
															+            int64_t ne13 = cne1[3];

														
 
															+

														
 
															+            size_t nb0 = cnb[0];

														
 
															+            size_t nb1 = cnb[1];

														
 
															+            size_t nb2 = cnb[2];

														
 
															+            size_t nb3 = cnb[3];

														
 
															+

														
 
															+            size_t nb00 = cnb0[0];

														
 
															+            size_t nb01 = cnb0[1];

														
 
															+            size_t nb02 = cnb0[2];

														
 
															+            size_t nb03 = cnb0[3];

														
 
															+

														
 
															+            size_t nb10 = cnb1[0];

														
 
															+            size_t nb11 = cnb1[1];

														
 
															+            size_t nb12 = cnb1[2];

														
 
															+            size_t nb13 = cnb1[3];

														
 
															+

														
 
															+            size_t s0 = nb0 / sizeof(dst_t);

														
 
															+            size_t s1 = nb1 / sizeof(dst_t);

														
 
															+            size_t s2 = nb2 / sizeof(dst_t);

														
 
															+            size_t s3 = nb3 / sizeof(dst_t);

														
 
															+

														
 
															+            size_t s10 = nb10 / sizeof(src1_t);

														
 
															+            size_t s11 = nb11 / sizeof(src1_t);

														
 
															+            size_t s12 = nb12 / sizeof(src1_t);

														
 
															+            size_t s13 = nb13 / sizeof(src1_t);

														
 
															+

														
 
															+            size_t s00 = nb00 / sizeof(src0_t);

														
 
															+            size_t s01 = nb01 / sizeof(src0_t);

														
 
															+            size_t s02 = nb02 / sizeof(src0_t);

														
 
															+            size_t s03 = nb03 / sizeof(src0_t);

														
 
															+

														
 
															+            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);

														
 
															+            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);

														
 
															+            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);

														
 
															+            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);

														
 
															+

														
 
															+            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);

														
 
															+            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);

														
 
															+            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);

														
 
															+            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);

														
 
															+

														
 
															+            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);

														
 
															+            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);

														
 
															+            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);

														
 
															+            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);

														
 
															+

														
 
															+            GGML_ASSERT(s0 == 1);

														
 
															+            GGML_ASSERT(s00 == 1);

														
 
															+            GGML_ASSERT(s10 == 1);

														
 
															+

														
 
															+            const int block_size = 128;

														
 
															+

														
 
															+            int64_t hne0 = std::max(ne0/2LL, 1LL);

														
 
															+

														
 
															+            dim3 block_dims;

														
 
															+            block_dims.x = std::min<unsigned int>(hne0, block_size);

														
 
															+            block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);

														
 
															+            block_dims.z = std::min(std::min<unsigned int>(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U);

														
 
															+

														
 
															+            dim3 block_nums(

														
 
															+                (hne0 + block_dims.x - 1) / block_dims.x,

														
 
															+                (ne1 + block_dims.y - 1) / block_dims.y,

														
 
															+                (ne2*ne3 + block_dims.z - 1) / block_dims.z

														
 
															+            );

														
 
															+

														
 
															+            if (block_nums.z > 65535) {

														
 
															+                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel

														
 
															+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;

														
 
															+                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(

														
 
															+                    src0_dd, src1_dd, dst_dd,

														
 
															+                    ne0, ne1, ne2, ne3,

														
 
															+                    ne10, ne11, ne12, ne13,

														
 
															+                    /* s0, */ s1, s2, s3,

														
 
															+                    /* s00, */ s01, s02, s03,

														
 
															+                    /* s10, */ s11, s12, s13);

														
 
															+            } else {

														
 
															+                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(

														
 
															+                    src0_dd, src1_dd, dst_dd,

														
 
															+                    ne0, ne1, ne2, ne3,

														
 
															+                    ne10, ne11, ne12, ne13,

														
 
															+                    /* s0, */ s1, s2, s3,

														
 
															+                    /* s00, */ s01, s02, s03,

														
 
															+                    /* s10, */ s11, s12, s13);

														
 
															+            }

														
 
															+        }

														
 
															+    }

														
 
															+};

														
 
															+

														
 
															+template<class op>

														
 
															+static void ggml_cuda_op_bin_bcast(

														
 
															+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,

														
 
															+    const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {

														
 
															+

														
 
															+    GGML_ASSERT(src1->type == GGML_TYPE_F32);

														
 
															+

														
 
															+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {

														
 
															+        op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);

														
 
															+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {

														
 
															+        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);

														
 
															+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {

														
 
															+        op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);

														
 
															+    } else {

														
 
															+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,

														
 
															+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));

														
 
															+        GGML_ASSERT(false);

														
 
															+    }

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream());

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());

														
 
															+}

														
--- a/llama/ggml-cuda/binbcast.cuh
+++ b/llama/ggml-cuda/binbcast.cuh
@@ -1,6 +1,6 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
 
															+void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
 
															+void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
 
															+void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/clamp.cuh
+++ b/llama/ggml-cuda/clamp.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_CLAMP_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_CLAMP_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/concat.cuh
+++ b/llama/ggml-cuda/concat.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_CONCAT_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_CONCAT_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/convert.cuh
+++ b/llama/ggml-cuda/convert.cuh
@@ -1,13 +1,13 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
														
 
															-
														
 
															-template<typename T>
														
 
															-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
														
 
															-
														
 
															-typedef to_t_cuda_t<float> to_fp32_cuda_t;
														
 
															-typedef to_t_cuda_t<half> to_fp16_cuda_t;
														
 
															-
														
 
															-to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
														
 
															-
														
 
															-to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_DEQUANTIZE_BLOCK_SIZE 256

														
 
															+

														
 
															+template<typename T>

														
 
															+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);

														
 
															+

														
 
															+typedef to_t_cuda_t<float> to_fp32_cuda_t;

														
 
															+typedef to_t_cuda_t<half> to_fp16_cuda_t;

														
 
															+

														
 
															+to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);

														
 
															+

														
 
															+to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);

														
--- a/llama/ggml-cuda/dequantize.cuh
+++ b/llama/ggml-cuda/dequantize.cuh
@@ -1,103 +1,103 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
														
 
															-    const block_q4_0 * x = (const block_q4_0 *) vx;
														
 
															-
														
 
															-    const dfloat d = x[ib].d;
														
 
															-
														
 
															-    const int vui = x[ib].qs[iqs];
														
 
															-
														
 
															-    v.x = vui & 0xF;
														
 
															-    v.y = vui >> 4;
														
 
															-
														
 
															-#ifdef GGML_CUDA_F16
														
 
															-    v = __hsub2(v, {8.0f, 8.0f});
														
 
															-    v = __hmul2(v, {d, d});
														
 
															-#else
														
 
															-    v.x = (v.x - 8.0f) * d;
														
 
															-    v.y = (v.y - 8.0f) * d;
														
 
															-#endif // GGML_CUDA_F16
														
 
															-}
														
 
															-
														
 
															-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
														
 
															-    const block_q4_1 * x = (const block_q4_1 *) vx;
														
 
															-
														
 
															-    const dfloat d = __low2half(x[ib].dm);
														
 
															-    const dfloat m = __high2half(x[ib].dm);
														
 
															-
														
 
															-    const int vui = x[ib].qs[iqs];
														
 
															-
														
 
															-    v.x = vui & 0xF;
														
 
															-    v.y = vui >> 4;
														
 
															-
														
 
															-#ifdef GGML_CUDA_F16
														
 
															-    v = __hmul2(v, {d, d});
														
 
															-    v = __hadd2(v, {m, m});
														
 
															-#else
														
 
															-    v.x = (v.x * d) + m;
														
 
															-    v.y = (v.y * d) + m;
														
 
															-#endif // GGML_CUDA_F16
														
 
															-}
														
 
															-
														
 
															-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
														
 
															-    const block_q5_0 * x = (const block_q5_0 *) vx;
														
 
															-
														
 
															-    const dfloat d = x[ib].d;
														
 
															-
														
 
															-    uint32_t qh;
														
 
															-    memcpy(&qh, x[ib].qh, sizeof(qh));
														
 
															-
														
 
															-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
														
 
															-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
														
 
															-
														
 
															-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
														
 
															-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
														
 
															-
														
 
															-#ifdef GGML_CUDA_F16
														
 
															-    v = __hsub2(v, {16.0f, 16.0f});
														
 
															-    v = __hmul2(v, {d, d});
														
 
															-#else
														
 
															-    v.x = (v.x - 16.0f) * d;
														
 
															-    v.y = (v.y - 16.0f) * d;
														
 
															-#endif // GGML_CUDA_F16
														
 
															-}
														
 
															-
														
 
															-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
														
 
															-    const block_q5_1 * x = (const block_q5_1 *) vx;
														
 
															-
														
 
															-    const dfloat d = __low2half(x[ib].dm);
														
 
															-    const dfloat m = __high2half(x[ib].dm);
														
 
															-
														
 
															-    uint32_t qh;
														
 
															-    memcpy(&qh, x[ib].qh, sizeof(qh));
														
 
															-
														
 
															-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
														
 
															-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
														
 
															-
														
 
															-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
														
 
															-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
														
 
															-
														
 
															-#ifdef GGML_CUDA_F16
														
 
															-    v = __hmul2(v, {d, d});
														
 
															-    v = __hadd2(v, {m, m});
														
 
															-#else
														
 
															-    v.x = (v.x * d) + m;
														
 
															-    v.y = (v.y * d) + m;
														
 
															-#endif // GGML_CUDA_F16
														
 
															-}
														
 
															-
														
 
															-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
														
 
															-    const block_q8_0 * x = (const block_q8_0 *) vx;
														
 
															-
														
 
															-    const dfloat d = x[ib].d;
														
 
															-
														
 
															-    v.x = x[ib].qs[iqs + 0];
														
 
															-    v.y = x[ib].qs[iqs + 1];
														
 
															-
														
 
															-#ifdef GGML_CUDA_F16
														
 
															-    v = __hmul2(v, {d, d});
														
 
															-#else
														
 
															-    v.x *= d;
														
 
															-    v.y *= d;
														
 
															-#endif // GGML_CUDA_F16
														
 
															-}
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){

														
 
															+    const block_q4_0 * x = (const block_q4_0 *) vx;

														
 
															+

														
 
															+    const dfloat d = x[ib].d;

														
 
															+

														
 
															+    const int vui = x[ib].qs[iqs];

														
 
															+

														
 
															+    v.x = vui & 0xF;

														
 
															+    v.y = vui >> 4;

														
 
															+

														
 
															+#ifdef GGML_CUDA_F16

														
 
															+    v = __hsub2(v, {8.0f, 8.0f});

														
 
															+    v = __hmul2(v, {d, d});

														
 
															+#else

														
 
															+    v.x = (v.x - 8.0f) * d;

														
 
															+    v.y = (v.y - 8.0f) * d;

														
 
															+#endif // GGML_CUDA_F16

														
 
															+}

														
 
															+

														
 
															+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){

														
 
															+    const block_q4_1 * x = (const block_q4_1 *) vx;

														
 
															+

														
 
															+    const dfloat d = __low2half(x[ib].dm);

														
 
															+    const dfloat m = __high2half(x[ib].dm);

														
 
															+

														
 
															+    const int vui = x[ib].qs[iqs];

														
 
															+

														
 
															+    v.x = vui & 0xF;

														
 
															+    v.y = vui >> 4;

														
 
															+

														
 
															+#ifdef GGML_CUDA_F16

														
 
															+    v = __hmul2(v, {d, d});

														
 
															+    v = __hadd2(v, {m, m});

														
 
															+#else

														
 
															+    v.x = (v.x * d) + m;

														
 
															+    v.y = (v.y * d) + m;

														
 
															+#endif // GGML_CUDA_F16

														
 
															+}

														
 
															+

														
 
															+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){

														
 
															+    const block_q5_0 * x = (const block_q5_0 *) vx;

														
 
															+

														
 
															+    const dfloat d = x[ib].d;

														
 
															+

														
 
															+    uint32_t qh;

														
 
															+    memcpy(&qh, x[ib].qh, sizeof(qh));

														
 
															+

														
 
															+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;

														
 
															+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

														
 
															+

														
 
															+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);

														
 
															+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

														
 
															+

														
 
															+#ifdef GGML_CUDA_F16

														
 
															+    v = __hsub2(v, {16.0f, 16.0f});

														
 
															+    v = __hmul2(v, {d, d});

														
 
															+#else

														
 
															+    v.x = (v.x - 16.0f) * d;

														
 
															+    v.y = (v.y - 16.0f) * d;

														
 
															+#endif // GGML_CUDA_F16

														
 
															+}

														
 
															+

														
 
															+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){

														
 
															+    const block_q5_1 * x = (const block_q5_1 *) vx;

														
 
															+

														
 
															+    const dfloat d = __low2half(x[ib].dm);

														
 
															+    const dfloat m = __high2half(x[ib].dm);

														
 
															+

														
 
															+    uint32_t qh;

														
 
															+    memcpy(&qh, x[ib].qh, sizeof(qh));

														
 
															+

														
 
															+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;

														
 
															+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;

														
 
															+

														
 
															+    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);

														
 
															+    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);

														
 
															+

														
 
															+#ifdef GGML_CUDA_F16

														
 
															+    v = __hmul2(v, {d, d});

														
 
															+    v = __hadd2(v, {m, m});

														
 
															+#else

														
 
															+    v.x = (v.x * d) + m;

														
 
															+    v.y = (v.y * d) + m;

														
 
															+#endif // GGML_CUDA_F16

														
 
															+}

														
 
															+

														
 
															+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){

														
 
															+    const block_q8_0 * x = (const block_q8_0 *) vx;

														
 
															+

														
 
															+    const dfloat d = x[ib].d;

														
 
															+

														
 
															+    v.x = x[ib].qs[iqs + 0];

														
 
															+    v.y = x[ib].qs[iqs + 1];

														
 
															+

														
 
															+#ifdef GGML_CUDA_F16

														
 
															+    v = __hmul2(v, {d, d});

														
 
															+#else

														
 
															+    v.x *= d;

														
 
															+    v.y *= d;

														
 
															+#endif // GGML_CUDA_F16

														
 
															+}

														
--- a/llama/ggml-cuda/diagmask.cu
+++ b/llama/ggml-cuda/diagmask.cu
@@ -1,40 +1,40 @@
 
															-#include "diagmask.cuh"
														
 
															-
														
 
															-static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
														
 
															-    const int col = blockDim.y*blockIdx.y + threadIdx.y;
														
 
															-    const int row = blockDim.x*blockIdx.x + threadIdx.x;
														
 
															-
														
 
															-    if (col >= ncols) {
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    const int i = row*ncols + col;
														
 
															-    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
														
 
															-    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
														
 
															-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
														
 
															-}
														
 
															-
														
 
															-static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
														
 
															-    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
														
 
															-    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
														
 
															-    const dim3 block_nums(nrows_x, block_num_x, 1);
														
 
															-    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * src0 = dst->src[0];
														
 
															-    const float * src0_d = (const float *)src0->data;
														
 
															-    float * dst_d = (float *)dst->data;
														
 
															-    cudaStream_t stream = ctx.stream();
														
 
															-
														
 
															-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
														
 
															-
														
 
															-    const int64_t ne00 = src0->ne[0];
														
 
															-    const int64_t ne01 = src0->ne[1];
														
 
															-    const int nrows0 = ggml_nrows(src0);
														
 
															-
														
 
															-    const int n_past = ((int32_t *) dst->op_params)[0];
														
 
															-
														
 
															-    diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);
														
 
															-}
														
 
															+#include "diagmask.cuh"

														
 
															+

														
 
															+static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {

														
 
															+    const int col = blockDim.y*blockIdx.y + threadIdx.y;

														
 
															+    const int row = blockDim.x*blockIdx.x + threadIdx.x;

														
 
															+

														
 
															+    if (col >= ncols) {

														
 
															+        return;

														
 
															+    }

														
 
															+

														
 
															+    const int i = row*ncols + col;

														
 
															+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];

														
 
															+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU

														
 
															+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;

														
 
															+}

														
 
															+

														
 
															+static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {

														
 
															+    const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);

														
 
															+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;

														
 
															+    const dim3 block_nums(nrows_x, block_num_x, 1);

														
 
															+    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    const ggml_tensor * src0 = dst->src[0];

														
 
															+    const float * src0_d = (const float *)src0->data;

														
 
															+    float * dst_d = (float *)dst->data;

														
 
															+    cudaStream_t stream = ctx.stream();

														
 
															+

														
 
															+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

														
 
															+

														
 
															+    const int64_t ne00 = src0->ne[0];

														
 
															+    const int64_t ne01 = src0->ne[1];

														
 
															+    const int nrows0 = ggml_nrows(src0);

														
 
															+

														
 
															+    const int n_past = ((int32_t *) dst->op_params)[0];

														
 
															+

														
 
															+    diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream);

														
 
															+}

														
--- a/llama/ggml-cuda/diagmask.cuh
+++ b/llama/ggml-cuda/diagmask.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
														
 
															-
														
 
															-void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32

														
 
															+

														
 
															+void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/dmmv.cuh
+++ b/llama/ggml-cuda/dmmv.cuh
@@ -1,18 +1,18 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-// dmmv = dequantize_mul_mat_vec
														
 
															-
														
 
															-// TODO: remove this?
														
 
															-#ifndef GGML_CUDA_DMMV_X
														
 
															-#define GGML_CUDA_DMMV_X 32
														
 
															-#endif
														
 
															-
														
 
															-#ifndef GGML_CUDA_MMV_Y
														
 
															-#define GGML_CUDA_MMV_Y 1
														
 
															-#endif
														
 
															-
														
 
															-void ggml_cuda_op_dequantize_mul_mat_vec(
														
 
															-    ggml_backend_cuda_context & ctx,
														
 
															-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
														
 
															-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
														
 
															-    const int64_t src1_padded_row_size, cudaStream_t stream);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+// dmmv = dequantize_mul_mat_vec

														
 
															+

														
 
															+// TODO: remove this?

														
 
															+#ifndef GGML_CUDA_DMMV_X

														
 
															+#define GGML_CUDA_DMMV_X 32

														
 
															+#endif

														
 
															+

														
 
															+#ifndef GGML_CUDA_MMV_Y

														
 
															+#define GGML_CUDA_MMV_Y 1

														
 
															+#endif

														
 
															+

														
 
															+void ggml_cuda_op_dequantize_mul_mat_vec(

														
 
															+    ggml_backend_cuda_context & ctx,

														
 
															+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,

														
 
															+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,

														
 
															+    const int64_t src1_padded_row_size, cudaStream_t stream);

														
--- a/llama/ggml-cuda/fattn-vec-f16.cu
+++ b/llama/ggml-cuda/fattn-vec-f16.cu
@@ -1,326 +0,0 @@
 
															-#include "common.cuh"
														
 
															-#include "fattn-common.cuh"
														
 
															-#include "fattn-vec-f16.cuh"
														
 
															-
														
 
															-template<int D, int ncols, int parallel_blocks> // D == head size
														
 
															-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
														
 
															-__launch_bounds__(D, 1)
														
 
															-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
														
 
															-static __global__ void flash_attn_vec_ext_f16(
														
 
															-        const char * __restrict__ Q,
														
 
															-        const char * __restrict__ K,
														
 
															-        const char * __restrict__ V,
														
 
															-        const char * __restrict__ mask,
														
 
															-        float      * __restrict__ dst,
														
 
															-        float2     * __restrict__ dst_meta,
														
 
															-        const float scale,
														
 
															-        const float max_bias,
														
 
															-        const float m0,
														
 
															-        const float m1,
														
 
															-        const uint32_t n_head_log2,
														
 
															-        const int ne00,
														
 
															-        const int ne01,
														
 
															-        const int ne02,
														
 
															-        const int ne03,
														
 
															-        const int ne10,
														
 
															-        const int ne11,
														
 
															-        const int ne12,
														
 
															-        const int ne13,
														
 
															-        const int ne31,
														
 
															-        const int nb31,
														
 
															-        const int nb01,
														
 
															-        const int nb02,
														
 
															-        const int nb03,
														
 
															-        const int nb11,
														
 
															-        const int nb12,
														
 
															-        const int nb13,
														
 
															-        const int ne0,
														
 
															-        const int ne1,
														
 
															-        const int ne2,
														
 
															-        const int ne3) {
														
 
															-#if FP16_AVAILABLE
														
 
															-    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
														
 
															-
														
 
															-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
														
 
															-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
														
 
															-
														
 
															-    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
														
 
															-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
														
 
															-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
														
 
															-    const half   * V_h   = (const half   *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
														
 
															-    const half   * maskh = (const half   *)  mask + ne11*ic0;
														
 
															-
														
 
															-    const int stride_KV  = nb11 / sizeof(half);
														
 
															-    const int stride_KV2 = nb11 / sizeof(half2);
														
 
															-
														
 
															-    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
														
 
															-    const half  slopeh = __float2half(slopef);
														
 
															-
														
 
															-    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
														
 
															-    constexpr int nwarps = D / WARP_SIZE;
														
 
															-    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
														
 
															-    __builtin_assume(tid < D);
														
 
															-
														
 
															-    __shared__ half KQ[ncols*D];
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-        KQ[j*D + tid] = -HALF_MAX_HALF;
														
 
															-    }
														
 
															-    half2 * KQ2 = (half2 *) KQ;
														
 
															-
														
 
															-    half kqmax[ncols];
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-        kqmax[j] = -HALF_MAX_HALF;
														
 
															-    }
														
 
															-    half kqsum[ncols] = {0.0f};
														
 
															-
														
 
															-    __shared__ half kqmax_shared[ncols][WARP_SIZE];
														
 
															-    __shared__ half kqsum_shared[ncols][WARP_SIZE];
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-        if (threadIdx.y == 0) {
														
 
															-            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
														
 
															-            kqsum_shared[j][threadIdx.x] = 0.0f;
														
 
															-        }
														
 
															-    }
														
 
															-    __syncthreads();
														
 
															-
														
 
															-    // Convert Q to half2 and store in registers:
														
 
															-    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-#pragma unroll
														
 
															-        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
														
 
															-            const int i = i0 + threadIdx.x;
														
 
															-
														
 
															-            const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
														
 
															-            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
														
 
															-        }
														
 
															-    }
														
 
															-
														
 
															-    half2 VKQ[ncols] = {{0.0f, 0.0f}};
														
 
															-
														
 
															-    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
														
 
															-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
														
 
															-        // Calculate KQ tile and keep track of new maximum KQ values:
														
 
															-
														
 
															-        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
														
 
															-        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
														
 
															-        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
														
 
															-        half kqmax_new = kqmax[0];
														
 
															-        half kqmax_new_arr[ncols];
														
 
															-#pragma unroll
														
 
															-        for (int j = 0; j < ncols; ++j) {
														
 
															-            kqmax_new_arr[j] = kqmax[j];
														
 
															-        }
														
 
															-
														
 
															-#pragma unroll
														
 
															-        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
														
 
															-            const int i_KQ = i_KQ_0 + threadIdx.y;
														
 
															-
														
 
															-            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
														
 
															-                break;
														
 
															-            }
														
 
															-
														
 
															-            half2 sum2[ncols] = {{0.0f, 0.0f}};
														
 
															-#pragma unroll
														
 
															-            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
														
 
															-                const int k_KQ = k_KQ_0 + threadIdx.x;
														
 
															-
														
 
															-                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
														
 
															-#pragma unroll
														
 
															-                for (int j = 0; j < ncols; ++j) {
														
 
															-                    sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
														
 
															-                }
														
 
															-            }
														
 
															-
														
 
															-#pragma unroll
														
 
															-            for (int j = 0; j < ncols; ++j) {
														
 
															-                sum2[j] = warp_reduce_sum(sum2[j]);
														
 
															-                half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
														
 
															-                sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
														
 
															-
														
 
															-                if (ncols == 1) {
														
 
															-                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
														
 
															-                } else {
														
 
															-                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
														
 
															-                }
														
 
															-
														
 
															-                if (threadIdx.x == 0) {
														
 
															-                    KQ[j*D + i_KQ] = sum;
														
 
															-                }
														
 
															-            }
														
 
															-        }
														
 
															-
														
 
															-#pragma unroll
														
 
															-        for (int j = 0; j < ncols; ++j) {
														
 
															-            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
														
 
															-
														
 
															-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
														
 
															-            if (threadIdx.x == 0) {
														
 
															-                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
														
 
															-            }
														
 
															-        }
														
 
															-
														
 
															-        __syncthreads();
														
 
															-
														
 
															-#pragma unroll
														
 
															-        for (int j = 0; j < ncols; ++j) {
														
 
															-            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
														
 
															-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
														
 
															-
														
 
															-            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
														
 
															-            kqmax[j] = kqmax_new_j;
														
 
															-
														
 
															-            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
														
 
															-            kqsum[j] = kqsum[j]*KQ_max_scale + val;
														
 
															-            KQ[j*D + tid] = val;
														
 
															-
														
 
															-            VKQ[j] *= __half2half2(KQ_max_scale);
														
 
															-        }
														
 
															-
														
 
															-        __syncthreads();
														
 
															-
														
 
															-#pragma unroll
														
 
															-        for (int k0 = 0; k0 < D; k0 += 2) {
														
 
															-            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
														
 
															-                break;
														
 
															-            }
														
 
															-
														
 
															-            half2 V_k;
														
 
															-            reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
														
 
															-            reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
														
 
															-#pragma unroll
														
 
															-            for (int j = 0; j < ncols; ++j) {
														
 
															-                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
														
 
															-            }
														
 
															-        }
														
 
															-
														
 
															-        __syncthreads();
														
 
															-    }
														
 
															-
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-        kqsum[j] = warp_reduce_sum(kqsum[j]);
														
 
															-        if (threadIdx.x == 0) {
														
 
															-            kqsum_shared[j][threadIdx.y] = kqsum[j];
														
 
															-        }
														
 
															-    }
														
 
															-
														
 
															-    __syncthreads();
														
 
															-
														
 
															-#pragma unroll
														
 
															-    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
														
 
															-        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
														
 
															-        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
														
 
															-
														
 
															-        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
														
 
															-        if (parallel_blocks == 1) {
														
 
															-            dst_val /= kqsum[j_VKQ];
														
 
															-        }
														
 
															-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
														
 
															-        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
														
 
															-    }
														
 
															-
														
 
															-    if (parallel_blocks != 1 && tid < ncols) {
														
 
															-        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
														
 
															-    }
														
 
															-#else
														
 
															-   NO_DEVICE_CODE;
														
 
															-#endif // FP16_AVAILABLE
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    ggml_tensor * KQV = dst;
														
 
															-    ggml_tensor * Q   = dst->src[0];
														
 
															-
														
 
															-    const int32_t precision = KQV->op_params[2];
														
 
															-    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
														
 
															-
														
 
															-    constexpr int cols_per_block  = 1;
														
 
															-    constexpr int parallel_blocks = 4;
														
 
															-    switch (Q->ne[0]) {
														
 
															-        case  64: {
														
 
															-            constexpr int      D = 64;
														
 
															-            constexpr int nwarps = D/WARP_SIZE;
														
 
															-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
														
 
															-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
														
 
															-        } break;
														
 
															-        case 128: {
														
 
															-            constexpr int      D = 128;
														
 
															-            constexpr int nwarps = D/WARP_SIZE;
														
 
															-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
														
 
															-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
														
 
															-        } break;
														
 
															-        case 256: {
														
 
															-            constexpr int      D = 256;
														
 
															-            constexpr int nwarps = D/WARP_SIZE;
														
 
															-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
														
 
															-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
														
 
															-        } break;
														
 
															-        default:
														
 
															-            GGML_ASSERT(false);
														
 
															-            break;
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-template <int cols_per_block, int parallel_blocks>
														
 
															-void launch_fattn_vec_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * Q = dst->src[0];
														
 
															-    switch (Q->ne[0]) {
														
 
															-        case  64: {
														
 
															-            constexpr int      D = 64;
														
 
															-            constexpr int nwarps = D/WARP_SIZE;
														
 
															-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
														
 
															-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
														
 
															-        } break;
														
 
															-        case 128: {
														
 
															-            constexpr int      D = 128;
														
 
															-            constexpr int nwarps = D/WARP_SIZE;
														
 
															-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>;
														
 
															-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
														
 
															-        } break;
														
 
															-        default: {
														
 
															-            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
														
 
															-        } break;
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * KQV = dst;
														
 
															-    const ggml_tensor * Q   = dst->src[0];
														
 
															-
														
 
															-    const int32_t precision = KQV->op_params[2];
														
 
															-    GGML_ASSERT(precision == GGML_PREC_DEFAULT);
														
 
															-
														
 
															-    if (Q->ne[1] == 1) {
														
 
															-        ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    if (Q->ne[1] == 2) {
														
 
															-        constexpr int cols_per_block  = 2;
														
 
															-        constexpr int parallel_blocks = 4;
														
 
															-        launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    if (Q->ne[1] <= 4) {
														
 
															-        constexpr int cols_per_block  = 4;
														
 
															-        constexpr int parallel_blocks = 4;
														
 
															-        launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    if (Q->ne[1] <= 8) {
														
 
															-        constexpr int cols_per_block  = 8;
														
 
															-        constexpr int parallel_blocks = 4;
														
 
															-        launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    constexpr int cols_per_block  = 8;
														
 
															-    constexpr int parallel_blocks = 1;
														
 
															-    launch_fattn_vec_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
														
 
															-}
														
--- a/llama/ggml-cuda/fattn-vec-f32.cu
+++ b/llama/ggml-cuda/fattn-vec-f32.cu
@@ -1,275 +0,0 @@
 
															-#include "common.cuh"
														
 
															-#include "fattn-common.cuh"
														
 
															-#include "fattn-vec-f32.cuh"
														
 
															-
														
 
															-template<int D, int ncols, int parallel_blocks> // D == head size
														
 
															-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
														
 
															-__launch_bounds__(D, 1)
														
 
															-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
														
 
															-static __global__ void flash_attn_vec_ext_f32(
														
 
															-        const char * __restrict__ Q,
														
 
															-        const char * __restrict__ K,
														
 
															-        const char * __restrict__ V,
														
 
															-        const char * __restrict__ mask,
														
 
															-        float      * __restrict__ dst,
														
 
															-        float2     * __restrict__ dst_meta,
														
 
															-        const float scale,
														
 
															-        const float max_bias,
														
 
															-        const float m0,
														
 
															-        const float m1,
														
 
															-        const uint32_t n_head_log2,
														
 
															-        const int ne00,
														
 
															-        const int ne01,
														
 
															-        const int ne02,
														
 
															-        const int ne03,
														
 
															-        const int ne10,
														
 
															-        const int ne11,
														
 
															-        const int ne12,
														
 
															-        const int ne13,
														
 
															-        const int ne31,
														
 
															-        const int nb31,
														
 
															-        const int nb01,
														
 
															-        const int nb02,
														
 
															-        const int nb03,
														
 
															-        const int nb11,
														
 
															-        const int nb12,
														
 
															-        const int nb13,
														
 
															-        const int ne0,
														
 
															-        const int ne1,
														
 
															-        const int ne2,
														
 
															-        const int ne3) {
														
 
															-    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
														
 
															-
														
 
															-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
														
 
															-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
														
 
															-
														
 
															-    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
														
 
															-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
														
 
															-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
														
 
															-    const half   * V_h   = (const half   *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
														
 
															-    const half   * maskh = (const half   *)  mask + ne11*ic0;
														
 
															-
														
 
															-    const int stride_KV  = nb11 / sizeof(half);
														
 
															-    const int stride_KV2 = nb11 / sizeof(half2);
														
 
															-
														
 
															-    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
														
 
															-
														
 
															-    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
														
 
															-    constexpr int nwarps = D / WARP_SIZE;
														
 
															-    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
														
 
															-    __builtin_assume(tid < D);
														
 
															-
														
 
															-    __shared__ float KQ[ncols*D];
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-        KQ[j*D + tid] = -FLT_MAX/2.0f;
														
 
															-    }
														
 
															-
														
 
															-    float kqmax[ncols];
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-        kqmax[j] = -FLT_MAX/2.0f;
														
 
															-    }
														
 
															-    float kqsum[ncols] = {0.0f};
														
 
															-
														
 
															-    __shared__ float kqmax_shared[ncols][WARP_SIZE];
														
 
															-    __shared__ float kqsum_shared[ncols][WARP_SIZE];
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-        if (threadIdx.y == 0) {
														
 
															-            kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
														
 
															-            kqsum_shared[j][threadIdx.x] = 0.0f;
														
 
															-        }
														
 
															-    }
														
 
															-    __syncthreads();
														
 
															-
														
 
															-    // Convert Q to half2 and store in registers:
														
 
															-    float2 Q_h2[ncols][D/(2*WARP_SIZE)];
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-#pragma unroll
														
 
															-        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
														
 
															-            const int i = i0 + threadIdx.x;
														
 
															-
														
 
															-            Q_h2[j][i0/WARP_SIZE]    = Q_f2[j*(nb01/sizeof(float2)) + i];
														
 
															-            Q_h2[j][i0/WARP_SIZE].x *= scale;
														
 
															-            Q_h2[j][i0/WARP_SIZE].y *= scale;
														
 
															-        }
														
 
															-    }
														
 
															-
														
 
															-    float VKQ[ncols] = {0.0f};
														
 
															-
														
 
															-    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
														
 
															-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
														
 
															-        // Calculate KQ tile and keep track of new maximum KQ values:
														
 
															-
														
 
															-        float kqmax_new_arr[ncols];
														
 
															-#pragma unroll
														
 
															-        for (int j = 0; j < ncols; ++j) {
														
 
															-            kqmax_new_arr[j] = kqmax[j];
														
 
															-        }
														
 
															-
														
 
															-#pragma unroll
														
 
															-        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
														
 
															-            const int i_KQ = i_KQ_0 + threadIdx.y;
														
 
															-
														
 
															-            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
														
 
															-                break;
														
 
															-            }
														
 
															-
														
 
															-            float sum[ncols] = {0.0f};
														
 
															-#pragma unroll
														
 
															-            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
														
 
															-                const int k_KQ = k_KQ_0 + threadIdx.x;
														
 
															-
														
 
															-                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
														
 
															-#pragma unroll
														
 
															-                for (int j = 0; j < ncols; ++j) {
														
 
															-                    sum[j] +=  __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x;
														
 
															-                    sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y;
														
 
															-                }
														
 
															-            }
														
 
															-
														
 
															-#pragma unroll
														
 
															-            for (int j = 0; j < ncols; ++j) {
														
 
															-                sum[j] = warp_reduce_sum(sum[j]);
														
 
															-                sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
														
 
															-
														
 
															-                kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]);
														
 
															-
														
 
															-                if (threadIdx.x == 0) {
														
 
															-                    KQ[j*D + i_KQ] = sum[j];
														
 
															-                }
														
 
															-            }
														
 
															-        }
														
 
															-
														
 
															-#pragma unroll
														
 
															-        for (int j = 0; j < ncols; ++j) {
														
 
															-            float kqmax_new_j = kqmax_new_arr[j];
														
 
															-
														
 
															-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
														
 
															-            if (threadIdx.x == 0) {
														
 
															-                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
														
 
															-            }
														
 
															-        }
														
 
															-
														
 
															-        __syncthreads();
														
 
															-
														
 
															-#pragma unroll
														
 
															-        for (int j = 0; j < ncols; ++j) {
														
 
															-            float kqmax_new_j = kqmax_shared[j][threadIdx.x];
														
 
															-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
														
 
															-
														
 
															-            const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
														
 
															-            kqmax[j] = kqmax_new_j;
														
 
															-
														
 
															-            const float val = expf(KQ[j*D + tid] - kqmax[j]);
														
 
															-            kqsum[j] = kqsum[j]*KQ_max_scale + val;
														
 
															-            KQ[j*D + tid] = val;
														
 
															-
														
 
															-            VKQ[j] *= KQ_max_scale;
														
 
															-        }
														
 
															-
														
 
															-        __syncthreads();
														
 
															-
														
 
															-#pragma unroll
														
 
															-        for (int k = 0; k < D; ++k) {
														
 
															-            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
														
 
															-                break;
														
 
															-            }
														
 
															-
														
 
															-            const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]);
														
 
															-#pragma unroll
														
 
															-            for (int j = 0; j < ncols; ++j) {
														
 
															-                VKQ[j] += V_ki*KQ[j*D + k];
														
 
															-            }
														
 
															-        }
														
 
															-
														
 
															-        __syncthreads();
														
 
															-    }
														
 
															-
														
 
															-#pragma unroll
														
 
															-    for (int j = 0; j < ncols; ++j) {
														
 
															-        kqsum[j] = warp_reduce_sum(kqsum[j]);
														
 
															-        if (threadIdx.x == 0) {
														
 
															-            kqsum_shared[j][threadIdx.y] = kqsum[j];
														
 
															-        }
														
 
															-    }
														
 
															-
														
 
															-    __syncthreads();
														
 
															-
														
 
															-#pragma unroll
														
 
															-    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
														
 
															-        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
														
 
															-        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
														
 
															-
														
 
															-        float dst_val = VKQ[j_VKQ];
														
 
															-        if (parallel_blocks == 1) {
														
 
															-            dst_val /= kqsum[j_VKQ];
														
 
															-        }
														
 
															-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
														
 
															-        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
														
 
															-    }
														
 
															-
														
 
															-    if (parallel_blocks != 1 && tid < ncols) {
														
 
															-        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-template <int cols_per_block, int parallel_blocks>
														
 
															-void launch_fattn_vec_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * Q = dst->src[0];
														
 
															-    switch (Q->ne[0]) {
														
 
															-        case  64: {
														
 
															-            constexpr int      D = 64;
														
 
															-            constexpr int nwarps = D/WARP_SIZE;
														
 
															-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
														
 
															-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
														
 
															-        } break;
														
 
															-        case 128: {
														
 
															-            constexpr int      D = 128;
														
 
															-            constexpr int nwarps = D/WARP_SIZE;
														
 
															-            fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>;
														
 
															-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
														
 
															-        } break;
														
 
															-        default: {
														
 
															-            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
														
 
															-        } break;
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * Q = dst->src[0];
														
 
															-
														
 
															-    if (Q->ne[1] == 1) {
														
 
															-        constexpr int cols_per_block  = 1;
														
 
															-        constexpr int parallel_blocks = 4;
														
 
															-        launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    if (Q->ne[1] == 2) {
														
 
															-        constexpr int cols_per_block  = 2;
														
 
															-        constexpr int parallel_blocks = 4;
														
 
															-        launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    if (Q->ne[1] <= 4) {
														
 
															-        constexpr int cols_per_block  = 4;
														
 
															-        constexpr int parallel_blocks = 4;
														
 
															-        launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    if (Q->ne[1] <= 8) {
														
 
															-        constexpr int cols_per_block  = 8;
														
 
															-        constexpr int parallel_blocks = 4;
														
 
															-        launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    constexpr int cols_per_block  = 8;
														
 
															-    constexpr int parallel_blocks = 1;
														
 
															-    launch_fattn_vec_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
														
 
															-}
														
--- a/llama/ggml-cuda/getrows.cu
+++ b/llama/ggml-cuda/getrows.cu
@@ -1,178 +1,178 @@
 
															-#include "getrows.cuh"
														
 
															-#include "dequantize.cuh"
														
 
															-
														
 
															-template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
														
 
															-static __global__ void k_get_rows(
														
 
															-            const void * src0, const int32_t * src1, dst_t * dst,
														
 
															-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
														
 
															-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
														
 
															-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
														
 
															-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
														
 
															-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
														
 
															-
														
 
															-    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
														
 
															-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
														
 
															-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
														
 
															-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
														
 
															-
														
 
															-    if (i00 >= ne00) {
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
														
 
															-
														
 
															-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
														
 
															-    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
														
 
															-
														
 
															-    const int ib = i00/qk; // block index
														
 
															-    const int iqs = (i00%qk)/qr; // quant index
														
 
															-    const int iybs = i00 - i00%qk; // dst block start index
														
 
															-    const int y_offset = qr == 1 ? 1 : qk/2;
														
 
															-
														
 
															-    // dequantize
														
 
															-    dfloat2 v;
														
 
															-    dequantize_kernel(src0_row, ib, iqs, v);
														
 
															-
														
 
															-    dst_row[iybs + iqs + 0]        = v.x;
														
 
															-    dst_row[iybs + iqs + y_offset] = v.y;
														
 
															-}
														
 
															-
														
 
															-template<typename src0_t, typename dst_t>
														
 
															-static __global__ void k_get_rows_float(
														
 
															-            const src0_t * src0, const int32_t * src1, dst_t * dst,
														
 
															-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
														
 
															-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
														
 
															-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
														
 
															-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
														
 
															-            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
														
 
															-
														
 
															-    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
														
 
															-    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
														
 
															-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
														
 
															-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
														
 
															-
														
 
															-    if (i00 >= ne00) {
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
														
 
															-
														
 
															-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
														
 
															-    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
														
 
															-
														
 
															-    dst_row[i00] = src0_row[i00];
														
 
															-}
														
 
															-
														
 
															-template<int qk, int qr, dequantize_kernel_t dq>
														
 
															-static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
														
 
															-                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
														
 
															-
														
 
															-    GGML_TENSOR_BINARY_OP_LOCALS
														
 
															-
														
 
															-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
														
 
															-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
														
 
															-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
														
 
															-
														
 
															-    // strides in elements
														
 
															-    //const size_t s0 = nb0 / ggml_element_size(dst);
														
 
															-    const size_t s1 = nb1 / ggml_element_size(dst);
														
 
															-    const size_t s2 = nb2 / ggml_element_size(dst);
														
 
															-    const size_t s3 = nb3 / ggml_element_size(dst);
														
 
															-
														
 
															-    const size_t s10 = nb10 / ggml_element_size(src1);
														
 
															-    const size_t s11 = nb11 / ggml_element_size(src1);
														
 
															-    const size_t s12 = nb12 / ggml_element_size(src1);
														
 
															-    //const size_t s13 = nb13 / ggml_element_size(src1);
														
 
															-
														
 
															-    GGML_ASSERT(ne00 % 2 == 0);
														
 
															-
														
 
															-    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
														
 
															-            src0_dd, src1_dd, dst_dd,
														
 
															-            ne00, /*ne01, ne02, ne03,*/
														
 
															-            /*ne10, ne11,*/ ne12, /*ne13,*/
														
 
															-            /* s0,*/ s1, s2, s3,
														
 
															-            /* nb00,*/ nb01, nb02, nb03,
														
 
															-            s10, s11, s12/*, s13*/);
														
 
															-
														
 
															-    GGML_UNUSED(dst);
														
 
															-}
														
 
															-
														
 
															-template<typename src0_t>
														
 
															-static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
														
 
															-                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
														
 
															-
														
 
															-    GGML_TENSOR_BINARY_OP_LOCALS
														
 
															-
														
 
															-    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
														
 
															-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
														
 
															-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
														
 
															-
														
 
															-    // strides in elements
														
 
															-    //const size_t s0 = nb0 / ggml_element_size(dst);
														
 
															-    const size_t s1 = nb1 / ggml_element_size(dst);
														
 
															-    const size_t s2 = nb2 / ggml_element_size(dst);
														
 
															-    const size_t s3 = nb3 / ggml_element_size(dst);
														
 
															-
														
 
															-    const size_t s10 = nb10 / ggml_element_size(src1);
														
 
															-    const size_t s11 = nb11 / ggml_element_size(src1);
														
 
															-    const size_t s12 = nb12 / ggml_element_size(src1);
														
 
															-    //const size_t s13 = nb13 / ggml_element_size(src1);
														
 
															-
														
 
															-    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
														
 
															-            src0_dd, src1_dd, dst_dd,
														
 
															-            ne00, /*ne01, ne02, ne03,*/
														
 
															-            /*ne10, ne11,*/ ne12, /*ne13,*/
														
 
															-            /* s0,*/ s1, s2, s3,
														
 
															-            /* nb00,*/ nb01, nb02, nb03,
														
 
															-            s10, s11, s12/*, s13*/);
														
 
															-
														
 
															-    GGML_UNUSED(dst);
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * src0 = dst->src[0];
														
 
															-    const ggml_tensor * src1 = dst->src[1];
														
 
															-    const float * src0_d = (const float *)src0->data;
														
 
															-    const float * src1_d = (const float *)src1->data;
														
 
															-    float * dst_d = (float *)dst->data;
														
 
															-    cudaStream_t stream = ctx.stream();
														
 
															-
														
 
															-
														
 
															-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
														
 
															-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
														
 
															-
														
 
															-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
														
 
															-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
														
 
															-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
														
 
															-
														
 
															-    const int32_t * src1_i32 = (const int32_t *) src1_d;
														
 
															-
														
 
															-    switch (src0->type) {
														
 
															-        case GGML_TYPE_F16:
														
 
															-            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
														
 
															-            break;
														
 
															-        case GGML_TYPE_F32:
														
 
															-            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
														
 
															-            break;
														
 
															-        case GGML_TYPE_Q4_0:
														
 
															-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
														
 
															-            break;
														
 
															-        case GGML_TYPE_Q4_1:
														
 
															-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
														
 
															-            break;
														
 
															-        case GGML_TYPE_Q5_0:
														
 
															-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
														
 
															-            break;
														
 
															-        case GGML_TYPE_Q5_1:
														
 
															-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
														
 
															-            break;
														
 
															-        case GGML_TYPE_Q8_0:
														
 
															-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
														
 
															-            break;
														
 
															-        default:
														
 
															-            // TODO: k-quants
														
 
															-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
														
 
															-            GGML_ASSERT(false);
														
 
															-            break;
														
 
															-    }
														
 
															-}
														
 
															+#include "getrows.cuh"

														
 
															+#include "dequantize.cuh"

														
 
															+

														
 
															+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>

														
 
															+static __global__ void k_get_rows(

														
 
															+            const void * src0, const int32_t * src1, dst_t * dst,

														
 
															+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/

														
 
															+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/

														
 
															+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,

														
 
															+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,

														
 
															+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

														
 
															+

														
 
															+    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;

														
 
															+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;

														
 
															+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;

														
 
															+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

														
 
															+

														
 
															+    if (i00 >= ne00) {

														
 
															+        return;

														
 
															+    }

														
 
															+

														
 
															+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

														
 
															+

														
 
															+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;

														
 
															+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;

														
 
															+

														
 
															+    const int ib = i00/qk; // block index

														
 
															+    const int iqs = (i00%qk)/qr; // quant index

														
 
															+    const int iybs = i00 - i00%qk; // dst block start index

														
 
															+    const int y_offset = qr == 1 ? 1 : qk/2;

														
 
															+

														
 
															+    // dequantize

														
 
															+    dfloat2 v;

														
 
															+    dequantize_kernel(src0_row, ib, iqs, v);

														
 
															+

														
 
															+    dst_row[iybs + iqs + 0]        = v.x;

														
 
															+    dst_row[iybs + iqs + y_offset] = v.y;

														
 
															+}

														
 
															+

														
 
															+template<typename src0_t, typename dst_t>

														
 
															+static __global__ void k_get_rows_float(

														
 
															+            const src0_t * src0, const int32_t * src1, dst_t * dst,

														
 
															+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/

														
 
															+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/

														
 
															+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,

														
 
															+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,

														
 
															+            size_t s10, size_t s11, size_t s12/*, size_t s13*/) {

														
 
															+

														
 
															+    const int i00 = blockIdx.x*blockDim.x + threadIdx.x;

														
 
															+    const int i10 = blockDim.y*blockIdx.y + threadIdx.y;

														
 
															+    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;

														
 
															+    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;

														
 
															+

														
 
															+    if (i00 >= ne00) {

														
 
															+        return;

														
 
															+    }

														
 
															+

														
 
															+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];

														
 
															+

														
 
															+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;

														
 
															+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);

														
 
															+

														
 
															+    dst_row[i00] = src0_row[i00];

														
 
															+}

														
 
															+

														
 
															+template<int qk, int qr, dequantize_kernel_t dq>

														
 
															+static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,

														
 
															+                            const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

														
 
															+

														
 
															+    GGML_TENSOR_BINARY_OP_LOCALS

														
 
															+

														
 
															+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);

														
 
															+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);

														
 
															+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

														
 
															+

														
 
															+    // strides in elements

														
 
															+    //const size_t s0 = nb0 / ggml_element_size(dst);

														
 
															+    const size_t s1 = nb1 / ggml_element_size(dst);

														
 
															+    const size_t s2 = nb2 / ggml_element_size(dst);

														
 
															+    const size_t s3 = nb3 / ggml_element_size(dst);

														
 
															+

														
 
															+    const size_t s10 = nb10 / ggml_element_size(src1);

														
 
															+    const size_t s11 = nb11 / ggml_element_size(src1);

														
 
															+    const size_t s12 = nb12 / ggml_element_size(src1);

														
 
															+    //const size_t s13 = nb13 / ggml_element_size(src1);

														
 
															+

														
 
															+    GGML_ASSERT(ne00 % 2 == 0);

														
 
															+

														
 
															+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(

														
 
															+            src0_dd, src1_dd, dst_dd,

														
 
															+            ne00, /*ne01, ne02, ne03,*/

														
 
															+            /*ne10, ne11,*/ ne12, /*ne13,*/

														
 
															+            /* s0,*/ s1, s2, s3,

														
 
															+            /* nb00,*/ nb01, nb02, nb03,

														
 
															+            s10, s11, s12/*, s13*/);

														
 
															+

														
 
															+    GGML_UNUSED(dst);

														
 
															+}

														
 
															+

														
 
															+template<typename src0_t>

														
 
															+static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,

														
 
															+                                const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {

														
 
															+

														
 
															+    GGML_TENSOR_BINARY_OP_LOCALS

														
 
															+

														
 
															+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);

														
 
															+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;

														
 
															+    const dim3 block_nums(block_num_x, ne10, ne11*ne12);

														
 
															+

														
 
															+    // strides in elements

														
 
															+    //const size_t s0 = nb0 / ggml_element_size(dst);

														
 
															+    const size_t s1 = nb1 / ggml_element_size(dst);

														
 
															+    const size_t s2 = nb2 / ggml_element_size(dst);

														
 
															+    const size_t s3 = nb3 / ggml_element_size(dst);

														
 
															+

														
 
															+    const size_t s10 = nb10 / ggml_element_size(src1);

														
 
															+    const size_t s11 = nb11 / ggml_element_size(src1);

														
 
															+    const size_t s12 = nb12 / ggml_element_size(src1);

														
 
															+    //const size_t s13 = nb13 / ggml_element_size(src1);

														
 
															+

														
 
															+    k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(

														
 
															+            src0_dd, src1_dd, dst_dd,

														
 
															+            ne00, /*ne01, ne02, ne03,*/

														
 
															+            /*ne10, ne11,*/ ne12, /*ne13,*/

														
 
															+            /* s0,*/ s1, s2, s3,

														
 
															+            /* nb00,*/ nb01, nb02, nb03,

														
 
															+            s10, s11, s12/*, s13*/);

														
 
															+

														
 
															+    GGML_UNUSED(dst);

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    const ggml_tensor * src0 = dst->src[0];

														
 
															+    const ggml_tensor * src1 = dst->src[1];

														
 
															+    const float * src0_d = (const float *)src0->data;

														
 
															+    const float * src1_d = (const float *)src1->data;

														
 
															+    float * dst_d = (float *)dst->data;

														
 
															+    cudaStream_t stream = ctx.stream();

														
 
															+

														
 
															+

														
 
															+    GGML_ASSERT(src1->type == GGML_TYPE_I32);

														
 
															+    GGML_ASSERT(dst->type == GGML_TYPE_F32);

														
 
															+

														
 
															+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));

														
 
															+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));

														
 
															+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));

														
 
															+

														
 
															+    const int32_t * src1_i32 = (const int32_t *) src1_d;

														
 
															+

														
 
															+    switch (src0->type) {

														
 
															+        case GGML_TYPE_F16:

														
 
															+            get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);

														
 
															+            break;

														
 
															+        case GGML_TYPE_F32:

														
 
															+            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);

														
 
															+            break;

														
 
															+        case GGML_TYPE_Q4_0:

														
 
															+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);

														
 
															+            break;

														
 
															+        case GGML_TYPE_Q4_1:

														
 
															+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);

														
 
															+            break;

														
 
															+        case GGML_TYPE_Q5_0:

														
 
															+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);

														
 
															+            break;

														
 
															+        case GGML_TYPE_Q5_1:

														
 
															+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);

														
 
															+            break;

														
 
															+        case GGML_TYPE_Q8_0:

														
 
															+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);

														
 
															+            break;

														
 
															+        default:

														
 
															+            // TODO: k-quants

														
 
															+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));

														
 
															+            GGML_ASSERT(false);

														
 
															+            break;

														
 
															+    }

														
 
															+}

														
--- a/llama/ggml-cuda/getrows.cuh
+++ b/llama/ggml-cuda/getrows.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_GET_ROWS_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_GET_ROWS_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/im2col.cu
+++ b/llama/ggml-cuda/im2col.cu
@@ -1,104 +1,104 @@
 
															-#include "im2col.cuh"
														
 
															-
														
 
															-template <typename T>
														
 
															-static  __global__ void im2col_kernel(
														
 
															-        const float * x, T * dst, int64_t batch_offset,
														
 
															-        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
														
 
															-        int s0, int s1, int p0, int p1, int d0, int d1) {
														
 
															-    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
														
 
															-    if (i >= pelements) {
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    const int64_t  ksize = OW * (KH > 1 ? KW : 1);
														
 
															-    const int64_t  kx = i / ksize;
														
 
															-    const int64_t  kd = kx * ksize;
														
 
															-    const int64_t  ky = (i - kd) / OW;
														
 
															-    const int64_t  ix = i % OW;
														
 
															-
														
 
															-    const int64_t  oh = blockIdx.y;
														
 
															-    const int64_t  batch = blockIdx.z / IC;
														
 
															-    const int64_t  ic = blockIdx.z % IC;
														
 
															-
														
 
															-    const int64_t iiw = ix * s0 + kx * d0 - p0;
														
 
															-    const int64_t iih = oh * s1 + ky * d1 - p1;
														
 
															-
														
 
															-    const int64_t offset_dst =
														
 
															-        ((batch * OH + oh) * OW + ix) * CHW +
														
 
															-        (ic * (KW * KH) + ky * KW + kx);
														
 
															-
														
 
															-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
														
 
															-        dst[offset_dst] = 0.0f;
														
 
															-    } else {
														
 
															-        const int64_t offset_src = ic * offset_delta + batch * batch_offset;
														
 
															-        dst[offset_dst] = x[offset_src + iih * IW + iiw];
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-template <typename T>
														
 
															-static void im2col_cuda(const float * x, T* dst,
														
 
															-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
														
 
															-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
														
 
															-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
														
 
															-    const int parallel_elements = OW * KW * KH;
														
 
															-    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
														
 
															-    dim3 block_nums(num_blocks, OH, batch * IC);
														
 
															-    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
														
 
															-}
														
 
															-
														
 
															-static void im2col_cuda_f16(const float * x, half * dst,
														
 
															-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
														
 
															-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
														
 
															-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
														
 
															-
														
 
															-    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
														
 
															-}
														
 
															-
														
 
															-static void im2col_cuda_f32(const float * x, float * dst,
														
 
															-    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
														
 
															-    int64_t batch, int64_t batch_offset, int64_t offset_delta,
														
 
															-    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
														
 
															-
														
 
															-    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * src0 = dst->src[0];
														
 
															-    const ggml_tensor * src1 = dst->src[1];
														
 
															-    const float * src1_d = (const float *)src1->data;
														
 
															-    float * dst_d = (float *)dst->data;
														
 
															-    cudaStream_t stream = ctx.stream();
														
 
															-
														
 
															-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
														
 
															-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
														
 
															-
														
 
															-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
														
 
															-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
														
 
															-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
														
 
															-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
														
 
															-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
														
 
															-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
														
 
															-
														
 
															-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
														
 
															-
														
 
															-    const int64_t IC = src1->ne[is_2D ? 2 : 1];
														
 
															-    const int64_t IH = is_2D ? src1->ne[1] : 1;
														
 
															-    const int64_t IW =         src1->ne[0];
														
 
															-
														
 
															-    const int64_t KH = is_2D ? src0->ne[1] : 1;
														
 
															-    const int64_t KW =         src0->ne[0];
														
 
															-
														
 
															-    const int64_t OH = is_2D ? dst->ne[2] : 1;
														
 
															-    const int64_t OW =         dst->ne[1];
														
 
															-
														
 
															-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
														
 
															-    const int64_t batch = src1->ne[3];
														
 
															-    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
														
 
															-
														
 
															-    if(dst->type == GGML_TYPE_F16) {
														
 
															-        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
														
 
															-    } else {
														
 
															-        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
														
 
															-    }
														
 
															-}
														
 
															+#include "im2col.cuh"

														
 
															+

														
 
															+template <typename T>

														
 
															+static  __global__ void im2col_kernel(

														
 
															+        const float * x, T * dst, int64_t batch_offset,

														
 
															+        int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,

														
 
															+        int s0, int s1, int p0, int p1, int d0, int d1) {

														
 
															+    const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;

														
 
															+    if (i >= pelements) {

														
 
															+        return;

														
 
															+    }

														
 
															+

														
 
															+    const int64_t  ksize = OW * (KH > 1 ? KW : 1);

														
 
															+    const int64_t  kx = i / ksize;

														
 
															+    const int64_t  kd = kx * ksize;

														
 
															+    const int64_t  ky = (i - kd) / OW;

														
 
															+    const int64_t  ix = i % OW;

														
 
															+

														
 
															+    const int64_t  oh = blockIdx.y;

														
 
															+    const int64_t  batch = blockIdx.z / IC;

														
 
															+    const int64_t  ic = blockIdx.z % IC;

														
 
															+

														
 
															+    const int64_t iiw = ix * s0 + kx * d0 - p0;

														
 
															+    const int64_t iih = oh * s1 + ky * d1 - p1;

														
 
															+

														
 
															+    const int64_t offset_dst =

														
 
															+        ((batch * OH + oh) * OW + ix) * CHW +

														
 
															+        (ic * (KW * KH) + ky * KW + kx);

														
 
															+

														
 
															+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {

														
 
															+        dst[offset_dst] = 0.0f;

														
 
															+    } else {

														
 
															+        const int64_t offset_src = ic * offset_delta + batch * batch_offset;

														
 
															+        dst[offset_dst] = x[offset_src + iih * IW + iiw];

														
 
															+    }

														
 
															+}

														
 
															+

														
 
															+template <typename T>

														
 
															+static void im2col_cuda(const float * x, T* dst,

														
 
															+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,

														
 
															+    int64_t batch, int64_t batch_offset, int64_t offset_delta,

														
 
															+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {

														
 
															+    const int parallel_elements = OW * KW * KH;

														
 
															+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;

														
 
															+    dim3 block_nums(num_blocks, OH, batch * IC);

														
 
															+    im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);

														
 
															+}

														
 
															+

														
 
															+static void im2col_cuda_f16(const float * x, half * dst,

														
 
															+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,

														
 
															+    int64_t batch, int64_t batch_offset, int64_t offset_delta,

														
 
															+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {

														
 
															+

														
 
															+    im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);

														
 
															+}

														
 
															+

														
 
															+static void im2col_cuda_f32(const float * x, float * dst,

														
 
															+    int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,

														
 
															+    int64_t batch, int64_t batch_offset, int64_t offset_delta,

														
 
															+    int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {

														
 
															+

														
 
															+    im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    const ggml_tensor * src0 = dst->src[0];

														
 
															+    const ggml_tensor * src1 = dst->src[1];

														
 
															+    const float * src1_d = (const float *)src1->data;

														
 
															+    float * dst_d = (float *)dst->data;

														
 
															+    cudaStream_t stream = ctx.stream();

														
 
															+

														
 
															+    GGML_ASSERT(src0->type == GGML_TYPE_F16);

														
 
															+    GGML_ASSERT(src1->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

														
 
															+

														
 
															+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];

														
 
															+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];

														
 
															+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];

														
 
															+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];

														
 
															+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];

														
 
															+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];

														
 
															+

														
 
															+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;

														
 
															+

														
 
															+    const int64_t IC = src1->ne[is_2D ? 2 : 1];

														
 
															+    const int64_t IH = is_2D ? src1->ne[1] : 1;

														
 
															+    const int64_t IW =         src1->ne[0];

														
 
															+

														
 
															+    const int64_t KH = is_2D ? src0->ne[1] : 1;

														
 
															+    const int64_t KW =         src0->ne[0];

														
 
															+

														
 
															+    const int64_t OH = is_2D ? dst->ne[2] : 1;

														
 
															+    const int64_t OW =         dst->ne[1];

														
 
															+

														
 
															+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32

														
 
															+    const int64_t batch = src1->ne[3];

														
 
															+    const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32

														
 
															+

														
 
															+    if(dst->type == GGML_TYPE_F16) {

														
 
															+        im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);

														
 
															+    } else {

														
 
															+        im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);

														
 
															+    }

														
 
															+}

														
--- a/llama/ggml-cuda/im2col.cuh
+++ b/llama/ggml-cuda/im2col.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_IM2COL_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_IM2COL_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/mmq.cu
+++ b/llama/ggml-cuda/mmq.cu
@@ -11,6 +11,7 @@ void ggml_cuda_op_mul_mat_q(
 
															     const int64_t nb01 = src0->nb[1];
														
 
															     const int64_t ne10 = src1->ne[0];
														
 
															+    const int64_t ne11 = src1->ne[1];
														
 
															     GGML_ASSERT(ne10 % QK8_1 == 0);
														
 
															     const int64_t ne0 = dst->ne[0];
														
@@ -25,7 +26,7 @@ void ggml_cuda_op_mul_mat_q(
 
															     // nrows_dst == nrows of the matrix that the kernel writes into
														
 
															     const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
														
 
															-    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst};
														
 
															+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
														
 
															     switch (src0->type) {
														
 
															         case GGML_TYPE_Q4_0:
														
--- a/llama/ggml-cuda/mmq.cuh
+++ b/llama/ggml-cuda/mmq.cuh
@@ -1,15 +1,26 @@
 
															+#pragma once
														
 
															+
														
 
															 #include "common.cuh"
														
 
															 #include "vecdotq.cuh"
														
 
															 #include <climits>
														
 
															 #include <cstdint>
														
 
															+#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)
														
 
															+
														
 
															 typedef void (*load_tiles_mmq_t)(
														
 
															     const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
														
 
															     int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride);
														
 
															 typedef void (*vec_dot_mmq_t)(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, float * __restrict__ sum, const int & k0);
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0);
														
 
															+
														
 
															+struct block_q8_1_mmq {
														
 
															+    half2  ds[4];
														
 
															+    int8_t qs[4*QK8_1];
														
 
															+};
														
 
															+static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
														
 
															+static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1),      "Unexpected block_q8_1_mmq size");
														
 
															 struct tile_x_sizes {
														
 
															     int ql;
														
@@ -132,10 +143,14 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
														
 
															+    const float * x_dmf = (const float *) x_dm;
														
 
															+    const int   * y_qs  = (const int   *) y + 4;
														
 
															+    const half2 * y_ds  = (const half2 *) y;
														
 
															+
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
 
															         const int j = j0 + threadIdx.y;
														
@@ -145,19 +160,18 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
 
															             const int i = i0 + threadIdx.x;
														
 
															             const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
														
 
															-            const float * x_dmf = (const float *) x_dm;
														
 
															             int u[2*VDR_Q4_0_Q8_1_MMQ];
														
 
															 #pragma unroll
														
 
															             for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
														
 
															-                u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
														
 
															-                u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
														
 
															+                u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l)         % WARP_SIZE];
														
 
															+                u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_0) % WARP_SIZE];
														
 
															             }
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
														
 
															-                (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
														
 
															-                y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
														
 
															+                (&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dmf[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
														
 
															+                y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -203,10 +217,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
														
 
															+    const int   * y_qs = (const int   *) y + 4;
														
 
															+    const half2 * y_ds = (const half2 *) y;
														
 
															+
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
 
															         const int j = j0 + threadIdx.y;
														
@@ -221,13 +238,13 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
 
															 #pragma unroll
														
 
															             for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
														
 
															-                u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
														
 
															-                u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
														
 
															+                u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l)         % WARP_SIZE];
														
 
															+                u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_1) % WARP_SIZE];
														
 
															             }
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
														
 
															-                (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1],
														
 
															-                y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
														
 
															+                (&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1],
														
 
															+                y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -293,10 +310,14 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
														
 
															+    const float * x_dmf = (const float *) x_dm;
														
 
															+    const int   * y_qs  = (const int   *) y + 4;
														
 
															+    const float * y_df  = (const float *) y;
														
 
															+
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
 
															         const int j = j0 + threadIdx.y;
														
@@ -306,20 +327,18 @@ static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
 
															             const int i = i0 + threadIdx.x;
														
 
															             const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
														
 
															-            const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0;
														
 
															-            const float * x_dmf = (const float *) x_dm;
														
 
															-            const float * y_df  = (const float *) y_ds;
														
 
															+            const int index_bx = i*(WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0;
														
 
															             int u[2*VDR_Q5_0_Q8_1_MMQ];
														
 
															 #pragma unroll
														
 
															             for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
														
 
															-                u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
														
 
															-                u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
														
 
															+                u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l)         % WARP_SIZE];
														
 
															+                u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_0) % WARP_SIZE];
														
 
															             }
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, QR5_0*VDR_Q5_0_Q8_1_MMQ>
														
 
															-                (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
														
 
															+                (&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dmf[index_bx], y_df[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -383,10 +402,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
														
 
															+    const int   * y_qs  = (const int   *) y + 4;
														
 
															+    const half2 * y_ds  = (const half2 *) y;
														
 
															+
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
 
															         const int j = j0 + threadIdx.y;
														
@@ -396,18 +418,18 @@ static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
 
															             const int i = i0 + threadIdx.x;
														
 
															             const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2));
														
 
															-            const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k0/QI5_1;
														
 
															+            const int index_bx = i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI5_1;
														
 
															             int u[2*VDR_Q5_1_Q8_1_MMQ];
														
 
															 #pragma unroll
														
 
															             for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
														
 
															-                u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
														
 
															-                u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
														
 
															+                u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l)         % WARP_SIZE];
														
 
															+                u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_1) % WARP_SIZE];
														
 
															             }
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
														
 
															-                (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
														
 
															+                (&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dm[index_bx], y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -455,10 +477,14 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
														
 
															+    const float * x_dmf = (const float *) x_dm;
														
 
															+    const int   * y_qs  = (const int   *) y + 4;
														
 
															+    const float * y_df  = (const float *) y;
														
 
															+
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
 
															         const int j = j0 + threadIdx.y;
														
@@ -467,12 +493,9 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
 
															         for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
														
 
															             const int i = i0 + threadIdx.x;
														
 
															-            const float * x_dmf = (const float *) x_dm;
														
 
															-            const float * y_df  = (const float *) y_ds;
														
 
															-
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMQ>
														
 
															-                (&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[j * WARP_SIZE + k0], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0],
														
 
															-                y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1]);
														
 
															+                (&x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0], x_dmf[i*(WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0],
														
 
															+                y_df[j*MMQ_TILE_Y_K + k0/QI8_1]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -531,10 +554,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															     GGML_UNUSED(x_qh);
														
 
															+    const int   * y_qs  = (const int   *) y + 4;
														
 
															+    const float * y_df  = (const float *) y;
														
 
															+
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
 
															         const int j = j0 + threadIdx.y;
														
@@ -545,11 +571,10 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
 
															             const int kbx = k0 / QI2_K;
														
 
															             const int ky  = (k0 % QI2_K) * QR2_K;
														
 
															-            const float * y_df = (const float *) y_ds;
														
 
															             int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
														
 
															-            const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
														
 
															+            const int kqsx = i*(WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
														
 
															             const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
														
 
															 #pragma unroll
														
@@ -557,11 +582,11 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat(
 
															                 v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
														
 
															             }
														
 
															-            const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
														
 
															+            const uint8_t * scales = ((const uint8_t *) &x_sc[i*(WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
														
 
															-            const int index_y = j * WARP_SIZE + (QR2_K*k0) % WARP_SIZE;
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq(
														
 
															-                v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
														
 
															+                v, &y_qs[j*MMQ_TILE_Y_K + (QR2_K*k0) % WARP_SIZE], scales,
														
 
															+                x_dm[i*(WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[j*MMQ_TILE_Y_K + ((QR2_K*k0) % WARP_SIZE)/QI8_1]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -646,7 +671,11 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															+
														
 
															+    const float * x_dmf = (const float *) x_dm;
														
 
															+    const int   * y_qs  = (const int   *) y + 4;
														
 
															+    const float * y_df  = (const float *) y;
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
@@ -658,8 +687,6 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
 
															             const int kbx  = k0 / QI3_K;
														
 
															             const int ky  = (k0 % QI3_K) * QR3_K;
														
 
															-            const float * x_dmf = (const float *) x_dm;
														
 
															-            const float * y_df  = (const float *) y_ds;
														
 
															             const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
														
@@ -667,19 +694,19 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat(
 
															 #pragma unroll
														
 
															             for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
														
 
															-                const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
														
 
															+                const int kqsx = i*(WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
														
 
															                 const int shift = 2 * ((ky % 32) / 8);
														
 
															                 const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
														
 
															-                const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
														
 
															+                const int vh = x_qh[i*(WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
														
 
															                 const int vlh = (vh << 2) & 0x04040404;
														
 
															                 v[l] = __vsubss4(vll, vlh);
														
 
															             }
														
 
															-            const int index_y = j * WARP_SIZE + (k0*QR3_K) % WARP_SIZE;
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq(
														
 
															-                v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
														
 
															+                v, &y_qs[j*MMQ_TILE_Y_K + (k0*QR3_K) % WARP_SIZE], scales,
														
 
															+                x_dmf[i*(WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[j*MMQ_TILE_Y_K + ((k0*QR3_K) % WARP_SIZE)/QI8_1]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -746,10 +773,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															     GGML_UNUSED(x_qh);
														
 
															+    const int   * y_qs = (const int   *) y + 4;
														
 
															+    const half2 * y_ds = (const half2 *) y;
														
 
															+
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
 
															         const int j = j0 + threadIdx.y;
														
@@ -760,9 +790,9 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
 
															             const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8);
														
 
															-            const int index_y = j * WARP_SIZE + (QR4_K*k0) % WARP_SIZE;
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq(
														
 
															-                &x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
														
 
															+                &x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + (QR4_K*k0) % WARP_SIZE], sc, sc+8,
														
 
															+                x_dm[i*(WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[j*MMQ_TILE_Y_K + ((QR4_K*k0) % WARP_SIZE)/QI8_1]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -842,10 +872,13 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															     GGML_UNUSED(x_qh);
														
 
															+    const int   * y_qs  = (const int   *) y + 4;
														
 
															+    const half2 * y_ds  = (const half2 *) y;
														
 
															+
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
 
															         const int j = j0 + threadIdx.y;
														
@@ -856,10 +889,9 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
 
															             const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
														
 
															-            const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k0;
														
 
															-            const int index_y = j * WARP_SIZE             + (QR5_K*k0) % WARP_SIZE;
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq(
														
 
															-                &x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
														
 
															+                &x_ql[i*(QR5_K*WARP_SIZE + 1) + QR5_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR5_K*k0) % WARP_SIZE], sc, sc+8,
														
 
															+                x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[j*MMQ_TILE_Y_K + ((QR5_K*k0) % WARP_SIZE)/QI8_1]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -932,10 +964,14 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
															 template <int mmq_x, int mmq_y, int nwarps>
														
 
															 static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
														
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															-    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) {
														
 
															+    const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
														
 
															     GGML_UNUSED(x_qh);
														
 
															+    const float * x_dmf = (const float *) x_dm;
														
 
															+    const int   * y_qs  = (const int   *) y + 4;
														
 
															+    const float * y_df  = (const float *) y;
														
 
															+
														
 
															 #pragma unroll
														
 
															     for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
														
 
															         const int j = j0 + threadIdx.y;
														
@@ -944,15 +980,11 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
 
															         for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
														
 
															             const int i = i0 + threadIdx.x;
														
 
															-            const float * x_dmf = (const float *) x_dm;
														
 
															-            const float * y_df  = (const float *) y_ds;
														
 
															-
														
 
															             const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]);
														
 
															-            const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k0;
														
 
															-            const int index_y = j * WARP_SIZE             + (QR6_K*k0) % WARP_SIZE;
														
 
															             sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq(
														
 
															-                &x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
														
 
															+                &x_ql[i*(QR6_K*WARP_SIZE + 1) + QR6_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR6_K*k0) % WARP_SIZE], sc,
														
 
															+                x_dmf[i*(WARP_SIZE/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + ((QR6_K*k0) % WARP_SIZE)/QI8_1]);
														
 
															         }
														
 
															     }
														
 
															 }
														
@@ -964,7 +996,6 @@ struct mmq_type_traits;
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
														
 
															-    static constexpr bool             need_sum   = true;
														
 
															     static constexpr int              vdr        = VDR_Q4_0_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q4_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
@@ -972,7 +1003,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
														
 
															-    static constexpr bool             need_sum   = true;
														
 
															     static constexpr int              vdr        = VDR_Q4_1_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q4_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
@@ -980,7 +1010,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
														
 
															-    static constexpr bool             need_sum   = false;
														
 
															     static constexpr int              vdr        = VDR_Q5_0_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q5_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
@@ -988,7 +1017,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
														
 
															-    static constexpr bool             need_sum   = true;
														
 
															     static constexpr int              vdr        = VDR_Q5_1_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q5_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
@@ -996,7 +1024,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
														
 
															-    static constexpr bool             need_sum   = false;
														
 
															     static constexpr int              vdr        = VDR_Q8_0_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q8_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
@@ -1004,7 +1031,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
														
 
															-    static constexpr bool             need_sum   = false;
														
 
															     static constexpr int              vdr        = VDR_Q2_K_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q2_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
@@ -1012,7 +1038,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
														
 
															-    static constexpr bool             need_sum   = false;
														
 
															     static constexpr int              vdr        = VDR_Q3_K_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q3_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
@@ -1020,7 +1045,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
														
 
															-    static constexpr bool             need_sum   = true;
														
 
															     static constexpr int              vdr        = VDR_Q4_K_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q4_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
@@ -1028,7 +1052,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
														
 
															-    static constexpr bool             need_sum   = true;
														
 
															     static constexpr int              vdr        = VDR_Q5_K_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q5_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
@@ -1036,12 +1059,36 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
 
															 template <int mmq_x, int mmq_y, int nwarps, bool need_check>
														
 
															 struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
														
 
															-    static constexpr bool             need_sum   = false;
														
 
															     static constexpr int              vdr        = VDR_Q6_K_Q8_1_MMQ;
														
 
															     static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>;
														
 
															     static constexpr vec_dot_mmq_t    vec_dot    = vec_dot_q6_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
														
 
															 };
														
 
															+static int mmq_need_sum(const ggml_type type_x) {
														
 
															+    switch (type_x) {
														
 
															+        case GGML_TYPE_Q4_0:
														
 
															+        case GGML_TYPE_Q4_1:
														
 
															+            return true;
														
 
															+        case GGML_TYPE_Q5_0:
														
 
															+            return false;
														
 
															+        case GGML_TYPE_Q5_1:
														
 
															+            return true;
														
 
															+        case GGML_TYPE_Q8_0:
														
 
															+        case GGML_TYPE_Q2_K:
														
 
															+        case GGML_TYPE_Q3_K:
														
 
															+            return false;
														
 
															+        case GGML_TYPE_Q4_K:
														
 
															+        case GGML_TYPE_Q5_K:
														
 
															+            return true;
														
 
															+        case GGML_TYPE_Q6_K:
														
 
															+            return false;
														
 
															+        default:
														
 
															+            GGML_ASSERT(false);
														
 
															+            break;
														
 
															+    }
														
 
															+    return false;
														
 
															+}
														
 
															+
														
 
															 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
														
 
															 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
														
 
															 #if defined(RDNA3) || defined(RDNA2)
														
@@ -1056,7 +1103,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 
															 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
														
 
															 static __global__ void mul_mat_q(
														
 
															     const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst,
														
 
															-    const int ne00, const int ne01, const int stride00, const int ne10, const int ne11, const int ne0) {
														
 
															+    const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
														
 
															     // Skip unused template specializations for faster compilation:
														
 
															     if (mmq_x > get_mmq_x_max_device()) {
														
@@ -1068,7 +1115,6 @@ static __global__ void mul_mat_q(
 
															     constexpr int              qr         = ggml_cuda_type_traits<type>::qr;
														
 
															     constexpr int              qi         = ggml_cuda_type_traits<type>::qi;
														
 
															     constexpr int              mmq_y      = get_mmq_y_device(mmq_x);
														
 
															-    constexpr bool             need_sum   = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::need_sum;
														
 
															     constexpr int              vdr        = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr;
														
 
															     constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
														
 
															     constexpr vec_dot_mmq_t    vec_dot    = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot;
														
@@ -1080,62 +1126,38 @@ static __global__ void mul_mat_q(
 
															     half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql);
														
 
															     int   * tile_x_qh = (int   *) (tile_x_dm + txs.dm);
														
 
															     int   * tile_x_sc = (int   *) (tile_x_qh + txs.qh);
														
 
															-    int   * tile_y_qs = (int   *) (tile_x_sc + txs.sc);          // [mmq_x * WARP_SIZE]
														
 
															-    half2 * tile_y_ds = (half2 *) (tile_y_qs + mmq_x*WARP_SIZE); // [mmq_x * WARP_SIZE/QI8_1];
														
 
															-
														
 
															-    const block_q8_1 * y = (const block_q8_1 *) yc;
														
 
															+    int   * tile_y    = (int   *) (tile_x_sc + txs.sc); // [mmq_x * (WARP_SIZE + WARP_SIZE/QI8_1)]
														
 
															     const int blocks_per_row_x = ne00 / qk;
														
 
															-    const int blocks_per_col_y = ne10 / QK8_1;
														
 
															     const int blocks_per_warp = WARP_SIZE / qi;
														
 
															     const int & ne1 = ne11;
														
 
															     const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1;
														
 
															+    const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
														
 
															+
														
 
															     float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f};
														
 
															     for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) {
														
 
															-        load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride00*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride00);
														
 
															+        load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride01*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride01);
														
 
															 #pragma unroll
														
 
															         for (int kr = 0; kr < qr; ++kr) {
														
 
															-            const int kqs = kr*WARP_SIZE + threadIdx.x;
														
 
															-            const int kbxd = kqs / QI8_1;
														
 
															-
														
 
															+            const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + kr*sizeof(block_q8_1_mmq)/sizeof(int));
														
 
															 #pragma unroll
														
 
															-            for (int i0 = 0; i0 < mmq_x; i0 += nwarps) {
														
 
															-                const int i = min(blockIdx.y*mmq_x + threadIdx.y + i0, ne11-1); // to prevent out-of-bounds memory accesses
														
 
															-
														
 
															-                const block_q8_1 * by0 = &y[i*blocks_per_col_y + kb0 * (qk/QK8_1) + kbxd];
														
 
															+            for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) {
														
 
															+                int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x;
														
 
															-                const int index_y = (i0 + threadIdx.y) * WARP_SIZE + kqs % WARP_SIZE;
														
 
															-                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
														
 
															-            }
														
 
															-
														
 
															-#pragma unroll
														
 
															-            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
														
 
															-                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
														
 
															-                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
														
 
															-                const int i_y_eff = min(blockIdx.y*mmq_x + ids, ne11-1);
														
 
															-
														
 
															-                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
														
 
															-                const half2 * dsi_src = &y[i_y_eff*blocks_per_col_y + kb0 * (qk/QK8_1) + kr*(WARP_SIZE/QI8_1) + kby].ds;
														
 
															-                half2       * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
														
 
															-                if (need_sum) {
														
 
															-                    *dsi_dst = *dsi_src;
														
 
															-                } else {
														
 
															-                    float * dfi_dst = (float *) dsi_dst;
														
 
															-                    *dfi_dst = __low2float(*dsi_src);
														
 
															-                }
														
 
															+                tile_y[l] = by0[l];
														
 
															             }
														
 
															             __syncthreads();
														
 
															 // #pragma unroll // unrolling this loop causes too much register pressure
														
 
															             for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) {
														
 
															-                vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, sum, k0);
														
 
															+                vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y, sum, k0);
														
 
															             }
														
 
															             __syncthreads();
														
@@ -1165,8 +1187,8 @@ static __global__ void mul_mat_q(
 
															 struct mmq_args {
														
 
															     const char * x; const char * y; float * dst;
														
 
															-    int64_t ne00; int64_t ne01; int64_t stride00;
														
 
															-    int64_t ne10; int64_t ne11;
														
 
															+    int64_t ne00; int64_t ne01; int64_t stride01;
														
 
															+    int64_t ne10; int64_t ne11; int64_t stride11;
														
 
															     int64_t ne0;
														
 
															 };
														
@@ -1184,7 +1206,7 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
 
															     const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y);
														
 
															     const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int);
														
 
															     const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2);
														
 
															-    const int shmem = shmem_x + shmem_y;
														
 
															+    const int shmem = shmem_x + GGML_PAD(shmem_y, nwarps*WARP_SIZE*sizeof(int));
														
 
															 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
														
 
															     static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
														
@@ -1198,11 +1220,11 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) {
 
															     if (args.ne01 % mmq_y == 0) {
														
 
															         const bool need_check = false;
														
 
															         mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
														
 
															-            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
														
 
															+            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
														
 
															     } else {
														
 
															         const bool need_check = true;
														
 
															         mul_mat_q<type, mmq_x, nwarps, need_check><<<block_nums, block_dims, shmem, stream>>>
														
 
															-            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0);
														
 
															+            (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0);
														
 
															     }
														
 
															 }
														
--- a/llama/ggml-cuda/mmvq.cuh
+++ b/llama/ggml-cuda/mmvq.cuh
@@ -1,7 +1,7 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-void ggml_cuda_op_mul_mat_vec_q(
														
 
															-    ggml_backend_cuda_context & ctx,
														
 
															-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
														
 
															-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
														
 
															-    const int64_t src1_padded_row_size, cudaStream_t stream);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+void ggml_cuda_op_mul_mat_vec_q(

														
 
															+    ggml_backend_cuda_context & ctx,

														
 
															+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,

														
 
															+    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,

														
 
															+    const int64_t src1_padded_row_size, cudaStream_t stream);

														
--- a/llama/ggml-cuda/norm.cuh
+++ b/llama/ggml-cuda/norm.cuh
@@ -1,7 +1,7 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															-
														
 
															-void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															-
														
 
															-void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
 
															+

														
 
															+void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
 
															+

														
 
															+void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/pad.cu
+++ b/llama/ggml-cuda/pad.cu
@@ -1,49 +1,49 @@
 
															-#include "pad.cuh"
														
 
															-
														
 
															-static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
														
 
															-    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03
														
 
															-    // blockIdx.y: idx of ne1
														
 
															-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
														
 
															-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
														
 
															-    if (nidx >= ne0) {
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    // operation
														
 
															-    int offset_dst =
														
 
															-        nidx +
														
 
															-        blockIdx.y * ne0 +
														
 
															-        blockIdx.z * ne0 * gridDim.y;
														
 
															-    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {
														
 
															-        int offset_src =
														
 
															-            nidx +
														
 
															-            blockIdx.y * ne00 +
														
 
															-            blockIdx.z * ne00 * ne01;
														
 
															-        dst[offset_dst] = x[offset_src];
														
 
															-    } else {
														
 
															-        dst[offset_dst] = 0.0f;
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-static void pad_f32_cuda(const float * x, float * dst,
														
 
															-    const int ne00, const int ne01, const int ne02, const int ne03,
														
 
															-    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
														
 
															-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
														
 
															-    dim3 gridDim(num_blocks, ne1, ne2*ne3);
														
 
															-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * src0 = dst->src[0];
														
 
															-    const float * src0_d = (const float *)src0->data;
														
 
															-    float * dst_d = (float *)dst->data;
														
 
															-    cudaStream_t stream = ctx.stream();
														
 
															-
														
 
															-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
														
 
															-
														
 
															-    pad_f32_cuda(src0_d, dst_d,
														
 
															-        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
														
 
															-        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
														
 
															-}
														
 
															+#include "pad.cuh"

														
 
															+

														
 
															+static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {

														
 
															+    // blockIdx.z: idx of ne2*ne3, aka ne02*ne03

														
 
															+    // blockIdx.y: idx of ne1

														
 
															+    // blockIDx.x: idx of ne0 / BLOCK_SIZE

														
 
															+    int nidx = threadIdx.x + blockIdx.x * blockDim.x;

														
 
															+    if (nidx >= ne0) {

														
 
															+        return;

														
 
															+    }

														
 
															+

														
 
															+    // operation

														
 
															+    int offset_dst =

														
 
															+        nidx +

														
 
															+        blockIdx.y * ne0 +

														
 
															+        blockIdx.z * ne0 * gridDim.y;

														
 
															+    if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) {

														
 
															+        int offset_src =

														
 
															+            nidx +

														
 
															+            blockIdx.y * ne00 +

														
 
															+            blockIdx.z * ne00 * ne01;

														
 
															+        dst[offset_dst] = x[offset_src];

														
 
															+    } else {

														
 
															+        dst[offset_dst] = 0.0f;

														
 
															+    }

														
 
															+}

														
 
															+

														
 
															+static void pad_f32_cuda(const float * x, float * dst,

														
 
															+    const int ne00, const int ne01, const int ne02, const int ne03,

														
 
															+    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {

														
 
															+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;

														
 
															+    dim3 gridDim(num_blocks, ne1, ne2*ne3);

														
 
															+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    const ggml_tensor * src0 = dst->src[0];

														
 
															+    const float * src0_d = (const float *)src0->data;

														
 
															+    float * dst_d = (float *)dst->data;

														
 
															+    cudaStream_t stream = ctx.stream();

														
 
															+

														
 
															+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT(dst->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

														
 
															+

														
 
															+    pad_f32_cuda(src0_d, dst_d,

														
 
															+        src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],

														
 
															+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);

														
 
															+}

														
--- a/llama/ggml-cuda/pad.cuh
+++ b/llama/ggml-cuda/pad.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_PAD_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_PAD_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/pool2d.cu
+++ b/llama/ggml-cuda/pool2d.cu
@@ -1,94 +1,94 @@
 
															-#include "pool2d.cuh"
														
 
															-
														
 
															-template <typename Ti, typename To>
														
 
															-static  __global__ void pool2d_nchw_kernel(
														
 
															-        const int ih, const int iw, const int oh, const int ow,
														
 
															-        const int kh, const int kw, const int sh, const int sw,
														
 
															-        const int ph, const int pw, const int parallel_elements,
														
 
															-        const Ti* src, To* dst, const enum ggml_op_pool op) {
														
 
															-    int idx = threadIdx.x + blockIdx.x * blockDim.x;
														
 
															-    if (idx >= parallel_elements) {
														
 
															-        return;
														
 
															-    }
														
 
															-
														
 
															-    const int I_HW = ih * iw;
														
 
															-    const int O_HW = oh * ow;
														
 
															-    const int nc = idx / O_HW;
														
 
															-    const int cur_oh = idx % O_HW / ow;
														
 
															-    const int cur_ow = idx % O_HW % ow;
														
 
															-    const Ti* i_ptr = src + nc * I_HW;
														
 
															-    To* o_ptr = dst + nc * O_HW;
														
 
															-    const int start_h = cur_oh * sh - ph;
														
 
															-    const int bh = max(0, start_h);
														
 
															-    const int eh = min(ih, start_h + kh);
														
 
															-    const int start_w = cur_ow * sw - pw;
														
 
															-    const int bw = max(0, start_w);
														
 
															-    const int ew = min(iw, start_w + kw);
														
 
															-    const To scale = 1. / (kh * kw);
														
 
															-    To res = 0;
														
 
															-
														
 
															-    switch (op) {
														
 
															-        case GGML_OP_POOL_AVG: res = 0; break;
														
 
															-        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
														
 
															-        default: assert(false);
														
 
															-    }
														
 
															-
														
 
															-    for (int i = bh; i < eh; i += 1) {
														
 
															-        for (int j = bw; j < ew; j += 1) {
														
 
															-#if __CUDA_ARCH__ >= 350
														
 
															-            Ti cur = __ldg(i_ptr + i * iw + j);
														
 
															-#else
														
 
															-            Ti cur = i_ptr[i * iw + j];
														
 
															-#endif
														
 
															-            switch (op) {
														
 
															-                case GGML_OP_POOL_AVG: res += cur * scale; break;
														
 
															-                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
														
 
															-                default: assert(false);
														
 
															-            }
														
 
															-        }
														
 
															-    }
														
 
															-    o_ptr[cur_oh * ow + cur_ow] = res;
														
 
															-}
														
 
															-
														
 
															-static void pool2d_nchw_kernel_f32_f32_cuda(
														
 
															-        const int ih, const int iw, const int oh, const int ow,
														
 
															-        const int kh, const int kw, const int sh, const int sw,
														
 
															-        const int ph, const int pw, const int parallel_elements,
														
 
															-        const float * src, float * dst, const enum ggml_op_pool op,
														
 
															-        cudaStream_t stream) {
														
 
															-
														
 
															-    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
														
 
															-    dim3 block_nums(num_blocks);
														
 
															-    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * src0 = dst->src[0];
														
 
															-    const float * src0_d = (const float *)src0->data;
														
 
															-    float * dst_d = (float *)dst->data;
														
 
															-    cudaStream_t stream = ctx.stream();
														
 
															-
														
 
															-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
														
 
															-
														
 
															-    const int32_t * opts = (const int32_t *)dst->op_params;
														
 
															-    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
														
 
															-    const int k0 = opts[1];
														
 
															-    const int k1 = opts[2];
														
 
															-    const int s0 = opts[3];
														
 
															-    const int s1 = opts[4];
														
 
															-    const int p0 = opts[5];
														
 
															-    const int p1 = opts[6];
														
 
															-
														
 
															-    const int64_t IH = src0->ne[1];
														
 
															-    const int64_t IW = src0->ne[0];
														
 
															-
														
 
															-    const int64_t N = dst->ne[3];
														
 
															-    const int64_t OC = dst->ne[2];
														
 
															-    const int64_t OH = dst->ne[1];
														
 
															-    const int64_t OW = dst->ne[0];
														
 
															-
														
 
															-    const int parallel_elements = N * OC * OH * OW;
														
 
															-
														
 
															-    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
														
 
															-}
														
 
															+#include "pool2d.cuh"

														
 
															+

														
 
															+template <typename Ti, typename To>

														
 
															+static  __global__ void pool2d_nchw_kernel(

														
 
															+        const int ih, const int iw, const int oh, const int ow,

														
 
															+        const int kh, const int kw, const int sh, const int sw,

														
 
															+        const int ph, const int pw, const int parallel_elements,

														
 
															+        const Ti* src, To* dst, const enum ggml_op_pool op) {

														
 
															+    int idx = threadIdx.x + blockIdx.x * blockDim.x;

														
 
															+    if (idx >= parallel_elements) {

														
 
															+        return;

														
 
															+    }

														
 
															+

														
 
															+    const int I_HW = ih * iw;

														
 
															+    const int O_HW = oh * ow;

														
 
															+    const int nc = idx / O_HW;

														
 
															+    const int cur_oh = idx % O_HW / ow;

														
 
															+    const int cur_ow = idx % O_HW % ow;

														
 
															+    const Ti* i_ptr = src + nc * I_HW;

														
 
															+    To* o_ptr = dst + nc * O_HW;

														
 
															+    const int start_h = cur_oh * sh - ph;

														
 
															+    const int bh = max(0, start_h);

														
 
															+    const int eh = min(ih, start_h + kh);

														
 
															+    const int start_w = cur_ow * sw - pw;

														
 
															+    const int bw = max(0, start_w);

														
 
															+    const int ew = min(iw, start_w + kw);

														
 
															+    const To scale = 1. / (kh * kw);

														
 
															+    To res = 0;

														
 
															+

														
 
															+    switch (op) {

														
 
															+        case GGML_OP_POOL_AVG: res = 0; break;

														
 
															+        case GGML_OP_POOL_MAX: res = -FLT_MAX; break;

														
 
															+        default: assert(false);

														
 
															+    }

														
 
															+

														
 
															+    for (int i = bh; i < eh; i += 1) {

														
 
															+        for (int j = bw; j < ew; j += 1) {

														
 
															+#if __CUDA_ARCH__ >= 350

														
 
															+            Ti cur = __ldg(i_ptr + i * iw + j);

														
 
															+#else

														
 
															+            Ti cur = i_ptr[i * iw + j];

														
 
															+#endif

														
 
															+            switch (op) {

														
 
															+                case GGML_OP_POOL_AVG: res += cur * scale; break;

														
 
															+                case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;

														
 
															+                default: assert(false);

														
 
															+            }

														
 
															+        }

														
 
															+    }

														
 
															+    o_ptr[cur_oh * ow + cur_ow] = res;

														
 
															+}

														
 
															+

														
 
															+static void pool2d_nchw_kernel_f32_f32_cuda(

														
 
															+        const int ih, const int iw, const int oh, const int ow,

														
 
															+        const int kh, const int kw, const int sh, const int sw,

														
 
															+        const int ph, const int pw, const int parallel_elements,

														
 
															+        const float * src, float * dst, const enum ggml_op_pool op,

														
 
															+        cudaStream_t stream) {

														
 
															+

														
 
															+    const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;

														
 
															+    dim3 block_nums(num_blocks);

														
 
															+    pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    const ggml_tensor * src0 = dst->src[0];

														
 
															+    const float * src0_d = (const float *)src0->data;

														
 
															+    float * dst_d = (float *)dst->data;

														
 
															+    cudaStream_t stream = ctx.stream();

														
 
															+

														
 
															+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

														
 
															+

														
 
															+    const int32_t * opts = (const int32_t *)dst->op_params;

														
 
															+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);

														
 
															+    const int k0 = opts[1];

														
 
															+    const int k1 = opts[2];

														
 
															+    const int s0 = opts[3];

														
 
															+    const int s1 = opts[4];

														
 
															+    const int p0 = opts[5];

														
 
															+    const int p1 = opts[6];

														
 
															+

														
 
															+    const int64_t IH = src0->ne[1];

														
 
															+    const int64_t IW = src0->ne[0];

														
 
															+

														
 
															+    const int64_t N = dst->ne[3];

														
 
															+    const int64_t OC = dst->ne[2];

														
 
															+    const int64_t OH = dst->ne[1];

														
 
															+    const int64_t OW = dst->ne[0];

														
 
															+

														
 
															+    const int parallel_elements = N * OC * OH * OW;

														
 
															+

														
 
															+    pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);

														
 
															+}

														
--- a/llama/ggml-cuda/pool2d.cuh
+++ b/llama/ggml-cuda/pool2d.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_POOL2D_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_POOL2D_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/quantize.cu
+++ b/llama/ggml-cuda/quantize.cu
@@ -1,22 +1,23 @@
 
															 #include "quantize.cuh"
														
 
															+#include <cstdint>
														
 
															-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
														
 
															-    const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
														
 
															+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) {
														
 
															+    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
														
 
															-    if (ix >= kx_padded) {
														
 
															+    if (ix0 >= kx0_padded) {
														
 
															         return;
														
 
															     }
														
 
															-    const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;
														
 
															+    const int64_t ix1 = blockIdx.y;
														
 
															-    const int64_t i_padded = (int64_t)iy*kx_padded + ix;
														
 
															+    const int64_t i_padded = ix1*kx0_padded + ix0;
														
 
															     block_q8_1 * y = (block_q8_1 *) vy;
														
 
															     const int64_t ib = i_padded / QK8_1; // block index
														
 
															     const int64_t iqs = i_padded % QK8_1; // quant index
														
 
															-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
														
 
															+    const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f;
														
 
															     float amax = fabsf(xi);
														
 
															     float sum = xi;
														
@@ -36,10 +37,76 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
 
															     reinterpret_cast<half&>(y[ib].ds.y) = sum;
														
 
															 }
														
 
															-void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
														
 
															-    const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
														
 
															-    const dim3 num_blocks(block_num_x, ky, 1);
														
 
															+template <bool need_sum>
														
 
															+static __global__ void quantize_mmq_q8_1(
														
 
															+    const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) {
														
 
															+
														
 
															+    const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
														
 
															+
														
 
															+    if (ix0 >= kx0_padded) {
														
 
															+        return;
														
 
															+    }
														
 
															+
														
 
															+    const int64_t ix1 = kx1*blockIdx.z + blockIdx.y;
														
 
															+
														
 
															+    block_q8_1_mmq * y = (block_q8_1_mmq *) vy;
														
 
															+
														
 
															+    const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel
														
 
															+    const int64_t ib  = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y;              // block index in channel
														
 
															+    const int64_t iqs = ix0 % (4*QK8_1);                                       // quant index in block
														
 
															+
														
 
															+    const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f;
														
 
															+    float amax = fabsf(xi);
														
 
															+
														
 
															+    amax = warp_reduce_max(amax);
														
 
															+
														
 
															+    float sum;
														
 
															+    if (need_sum) {
														
 
															+        sum = warp_reduce_sum(xi);
														
 
															+    }
														
 
															+
														
 
															+    const float d = amax / 127;
														
 
															+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
														
 
															+
														
 
															+    y[ib].qs[iqs] = q;
														
 
															+
														
 
															+    if (iqs % QK8_1 != 0) {
														
 
															+        return;
														
 
															+    }
														
 
															+
														
 
															+    if (need_sum) {
														
 
															+        y[ib].ds[iqs/QK8_1] = make_half2(d, sum);
														
 
															+    } else {
														
 
															+        ((float *) y[ib].ds)[iqs/QK8_1] = d;
														
 
															+    }
														
 
															+}
														
 
															+
														
 
															+void quantize_row_q8_1_cuda(
														
 
															+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
														
 
															+    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
														
 
															+
														
 
															+    GGML_ASSERT(kx0_padded % QK8_1 == 0);
														
 
															+
														
 
															+    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
														
 
															+    const dim3 num_blocks(block_num_x, kx1*channels, 1);
														
 
															     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
														
 
															-    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
														
 
															+    quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx0_padded);
														
 
															+
														
 
															+    GGML_UNUSED(type_x);
														
 
															 }
														
 
															+void quantize_mmq_q8_1_cuda(
														
 
															+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels,
														
 
															+    const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) {
														
 
															+
														
 
															+    GGML_ASSERT(kx0_padded % (4*QK8_1) == 0);
														
 
															+
														
 
															+    const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
														
 
															+    const dim3 num_blocks(block_num_x, kx1, channels);
														
 
															+    const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
														
 
															+    if (mmq_need_sum(type_x)) {
														
 
															+        quantize_mmq_q8_1<true><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
														
 
															+    } else {
														
 
															+        quantize_mmq_q8_1<false><<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
														
 
															+    }
														
 
															+}
														
--- a/llama/ggml-cuda/quantize.cuh
+++ b/llama/ggml-cuda/quantize.cuh
@@ -1,5 +1,20 @@
 
															+#pragma once
														
 
															+
														
 
															 #include "common.cuh"
														
 
															+#include "mmq.cuh"
														
 
															+
														
 
															+#include <cstdint>
														
 
															 #define CUDA_QUANTIZE_BLOCK_SIZE 256
														
 
															-void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
														
 
															+typedef void (*quantize_cuda_t)(
														
 
															+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
														
 
															+    const ggml_type type_x, cudaStream_t stream);
														
 
															+
														
 
															+void quantize_row_q8_1_cuda(
														
 
															+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
														
 
															+    const ggml_type type_x, cudaStream_t stream);
														
 
															+
														
 
															+void quantize_mmq_q8_1_cuda(
														
 
															+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
														
 
															+    const ggml_type type_x, cudaStream_t stream);
														
--- a/llama/ggml-cuda/rope.cuh
+++ b/llama/ggml-cuda/rope.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_ROPE_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_ROPE_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/scale.cuh
+++ b/llama/ggml-cuda/scale.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_SCALE_BLOCK_SIZE 256
														
 
															-
														
 
															-void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_SCALE_BLOCK_SIZE 256

														
 
															+

														
 
															+void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/softmax.cuh
+++ b/llama/ggml-cuda/softmax.cuh
@@ -1,5 +1,5 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
														
 
															-
														
 
															-void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024

														
 
															+

														
 
															+void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/sumrows.cu
+++ b/llama/ggml-cuda/sumrows.cu
@@ -1,40 +1,40 @@
 
															-#include "sumrows.cuh"
														
 
															-
														
 
															-static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
														
 
															-    const int row = blockIdx.x;
														
 
															-    const int col = threadIdx.x;
														
 
															-
														
 
															-    float sum = 0.0f;
														
 
															-    for (int i = col; i < ncols; i += blockDim.x) {
														
 
															-        sum += x[row * ncols + i];
														
 
															-    }
														
 
															-
														
 
															-    sum = warp_reduce_sum(sum);
														
 
															-
														
 
															-    if (col == 0) {
														
 
															-        dst[row] = sum;
														
 
															-    }
														
 
															-}
														
 
															-
														
 
															-static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
														
 
															-    const dim3 block_dims(WARP_SIZE, 1, 1);
														
 
															-    const dim3 block_nums(nrows, 1, 1);
														
 
															-    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
														
 
															-}
														
 
															-
														
 
															-void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
														
 
															-    const ggml_tensor * src0 = dst->src[0];
														
 
															-    const float * src0_d = (const float *)src0->data;
														
 
															-    float * dst_d = (float *)dst->data;
														
 
															-    cudaStream_t stream = ctx.stream();
														
 
															-
														
 
															-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
														
 
															-    GGML_ASSERT(ggml_is_contiguous(src0));
														
 
															-
														
 
															-
														
 
															-    const int64_t ncols = src0->ne[0];
														
 
															-    const int64_t nrows = ggml_nrows(src0);
														
 
															-
														
 
															-    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
														
 
															-}
														
 
															+#include "sumrows.cuh"

														
 
															+

														
 
															+static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {

														
 
															+    const int row = blockIdx.x;

														
 
															+    const int col = threadIdx.x;

														
 
															+

														
 
															+    float sum = 0.0f;

														
 
															+    for (int i = col; i < ncols; i += blockDim.x) {

														
 
															+        sum += x[row * ncols + i];

														
 
															+    }

														
 
															+

														
 
															+    sum = warp_reduce_sum(sum);

														
 
															+

														
 
															+    if (col == 0) {

														
 
															+        dst[row] = sum;

														
 
															+    }

														
 
															+}

														
 
															+

														
 
															+static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {

														
 
															+    const dim3 block_dims(WARP_SIZE, 1, 1);

														
 
															+    const dim3 block_nums(nrows, 1, 1);

														
 
															+    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);

														
 
															+}

														
 
															+

														
 
															+void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

														
 
															+    const ggml_tensor * src0 = dst->src[0];

														
 
															+    const float * src0_d = (const float *)src0->data;

														
 
															+    float * dst_d = (float *)dst->data;

														
 
															+    cudaStream_t stream = ctx.stream();

														
 
															+

														
 
															+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

														
 
															+    GGML_ASSERT(ggml_is_contiguous(src0));

														
 
															+

														
 
															+

														
 
															+    const int64_t ncols = src0->ne[0];

														
 
															+    const int64_t nrows = ggml_nrows(src0);

														
 
															+

														
 
															+    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);

														
 
															+}

														
--- a/llama/ggml-cuda/sumrows.cuh
+++ b/llama/ggml-cuda/sumrows.cuh
@@ -1,3 +1,3 @@
 
															-#include "common.cuh"
														
 
															-
														
 
															-void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
														
 
															+#include "common.cuh"

														
 
															+

														
 
															+void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f16.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f32.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f32.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f32.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
														
--- a/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu
@@ -0,0 +1,5 @@
 
															+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
														
 
															+
														
 
															+#include "../fattn-vec-f32.cuh"
														
 
															+
														
 
															+DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
 															 *.lib
 															 *.exp
 															 *.dll
+															+*.o