/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "llama.h"
#include "llama-impl.h"
#include "llama-arch.h"
#include "llama-mmap.h"

#include "ggml-cpp.h"

#include <cstddef>
#include <map>
#include <stdexcept>
#include <unordered_map>
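
// maps a file index of the model to the backend buffer that receives that file's tensor data
// (see load_all_data() below)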
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
    GGUF_FILE_VERSION_V3 = 3,
};

const char * llama_file_version_name(llama_fver version);
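
// loads model metadata and tensor data from one or more GGUF files
//
// rough call order (a sketch of how the loader is typically driven by the model loading
// code; variable names are illustrative):
//
//   llama_model_loader ml(fname, use_mmap, check_tensors, kv_overrides);
//   // ... read metadata via ml.get_key()/ml.get_arr() ...
//   // ... create the weight tensors via ml.create_tensor() ...
//   ml.done_getting_tensors(); // verify that all tensors were accounted for
//   ml.init_mappings();        // mmap the files (when use_mmap is enabled)
//   ml.load_all_data(...);     // stream the tensor data into the backend buffers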
struct llama_model_loader {
    // Holds information on a model weight
    struct llama_tensor_weight {
        uint16_t idx;  // source file index
        size_t   offs; // tensor data offset in the original file

        ggml_tensor * tensor;

        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
            const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
            if (tensor_idx < 0) {
                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
            }

            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
            }
        }
    };

    // custom comparator to sort weights more nicely by layer:
    // the layer number is parsed from the "blk.<N>." prefix and compared numerically,
    // so e.g. "blk.2.*" sorts before "blk.10.*"; ties fall back to lexicographic order
    struct weight_name_comparer {
        bool operator()(const std::string & a, const std::string & b) const {
            int a_layer = -1;
            int b_layer = -1;
            sscanf(a.c_str(), "blk.%d.", &a_layer);
            sscanf(b.c_str(), "blk.%d.", &b_layer);
            if (a_layer != b_layer) {
                return a_layer < b_layer;
            }
            return a < b;
        }
    };
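
    // flags accepted by create_tensor() below:
    //  - TENSOR_NOT_REQUIRED: the tensor is allowed to be missing from the model,
    //    in which case create_tensor() is expected to return nullptr instead of failing
    //  - TENSOR_DUPLICATED:   the tensor shares its data with another weight
    //    (e.g. tied input/output embeddings)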
    static const int TENSOR_NOT_REQUIRED = 1;
    static const int TENSOR_DUPLICATED   = 2;

    int n_kv      = 0; // number of key-value pairs in the GGUF metadata
    int n_tensors = 0; // number of tensors described in the model file(s)
    int n_created = 0; // number of tensors created via create_tensor() so far

    uint64_t n_elements = 0; // total number of elements across all weights
    size_t   n_bytes    = 0; // total tensor data size in bytes

    bool use_mmap = false;
    bool check_tensors;      // if set, validate tensor data while loading

    llama_files files;
    llama_ftype ftype;
    llama_fver  fver;

    llama_mmaps mappings;

    // tensor name -> weight info, ordered by layer (see weight_name_comparer)
    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
    // user-supplied metadata overrides, keyed by KV name
    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

    gguf_context_ptr meta;
    std::vector<ggml_context_ptr> contexts;

    std::string arch_name;
    LLM_KV      llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

    size_t size_done = 0; // bytes of tensor data loaded so far
    size_t size_data = 0; // total bytes of tensor data to load

    std::vector<std::pair<size_t, size_t>> mmaps_used;

    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p);
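
    // helpers for reading GGUF metadata (key-value pairs);
    // when 'required' is true, a missing key is treated as an error, otherwise the call returns false:
    //  - get_key():        read a single value
    //  - get_arr():        read an array value
    //  - get_arr_n():      read the number of elements of an array value
    //  - get_key_or_arr(): accept either a single value or an array for the same key
    //                      ('n' is the expected element count)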
    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(const std::string & key, T & result, bool required = true);

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);

    template<typename T>
    bool get_arr(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_key(const std::string & key, T & result, bool required = true);

    template<typename T>
    bool get_key(enum llm_kv kid, T & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);

    template<typename T>
    bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);

    std::string get_arch_name() const;
    enum llm_arch get_arch() const;
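
    // weight lookup and tensor creation:
    //  - get_weight()/require_weight():           look up a weight entry by tensor name
    //    (the require_* variants fail if the name is not found)
    //  - get_tensor_meta()/require_tensor_meta(): fetch the tensor metadata read from the GGUF file
    //  - create_tensor():         create a tensor with the given name and shape in ctx
    //                             (see the TENSOR_* flags above)
    //  - create_tensor_as_view(): create a tensor viewing 'base' at the given byte offset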
    const llama_tensor_weight * get_weight(const char * name) const;
    const llama_tensor_weight & require_weight(const char * name) const;

    struct ggml_tensor * get_tensor_meta(const char * name) const;
    struct ggml_tensor * require_tensor_meta(const std::string & name) const;

    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;

    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);

    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);

    void done_getting_tensors() const;
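
    // memory-map the model files; optionally prefetch the data and pin the mappings with mlock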
    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);

    // get the range of bytes (first to last) within mapping 'idx' that is spanned by the
    // tensors in ctx, together with the base address of that mapping
    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;

    // for backwards compatibility, does not support ggml-backend
    void load_data_for(struct ggml_tensor * cur) const;

    // Returns false if cancelled by progress_callback
    bool load_all_data(
            struct ggml_context * ctx,
            llama_buf_map & bufs,
            llama_mlocks * lmlocks,
            llama_progress_callback progress_callback,
            void * progress_callback_user_data);
};