@@ -913,7 +913,9 @@ struct llama_server_context
         slot.sampled = result.tok;
 
         // search stop word and delete it
-        slot.generated_text += token_str;
+        if (!llama_token_is_eog(model, result.tok))
+            slot.generated_text += token_str;
+
         slot.has_next_token = true;
 
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -954,30 +956,36 @@ struct llama_server_context
         if (!incomplete)
         {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-            }
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
-                // no send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.n_sent_text += result.text_to_send.size();
-                // add the token to slot queue and cache
-            }
+            if (!llama_token_is_eog(model, result.tok)) {
+                const std::string str_test = slot.generated_text.substr(pos);
+                bool is_stop_full = false;
+                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+                if (stop_pos != std::string::npos)
+                {
+                    is_stop_full = true;
+                    slot.generated_text.erase(
+                        slot.generated_text.begin() + pos + stop_pos,
+                        slot.generated_text.end());
+                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
+                }
+                else
+                {
+                    is_stop_full = false;
+                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+                }
+
+                // check if there is any token to predict
+                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+                {
+                    // no send the stop word in the response
+                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                    slot.n_sent_text += result.text_to_send.size();
+                    // add the token to slot queue and cache
+                }
+            } else {
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
+            }
             }
         }
 
         if (slot.params.stream)
@@ -1117,9 +1125,7 @@ struct llama_server_context
             {"multimodal", multimodal}
         };
 
-        if (!llama_token_is_eog(model, tkn.tok)) {
-            res.result_json["content"] = tkn.text_to_send;
-        }
+        res.result_json["content"] = tkn.text_to_send;
 
         if (slot.sparams.n_probs > 0)
         {