From 5a2fde838596887d165d75ce686ee2eee7695f41 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 22 Jun 2024 20:24:14 +0200 Subject: [PATCH 01/10] add chat template support for llama-cli --- common/common.cpp | 42 ++++++++++++++++++++++ common/common.h | 19 ++++++++++ examples/main/main.cpp | 69 ++++++++++++++++++++++++++---------- llama.cpp | 4 +-- tests/test-chat-template.cpp | 20 +++++++++++ 5 files changed, 134 insertions(+), 20 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index cfdedcbae0cd9..e88ce3f571c33 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2967,12 +2967,54 @@ bool llama_should_add_bos_token(const llama_model * model) { return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); } +// +// Chat template utils +// + bool llama_chat_verify_template(const std::string & tmpl) { llama_chat_message chat[] = {{"user", "test"}}; int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); return res >= 0; } +std::string llama_chat_format(const struct llama_model * model, + const std::string & tmpl, + const std::vector & msgs, + bool add_ass) { + std::vector chat; + for (auto & msg : msgs) { + chat.push_back({msg.role.c_str(), msg.content.c_str()}); + } + + const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); + std::vector buf; + + // run the first time to get the total output length + int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + + // if it turns out that our buffer is too small, we resize it + if ((size_t) res > buf.size()) { + buf.resize(res); + res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + } + + const std::string formatted_chat(buf.data(), res); + return formatted_chat; +} + +std::string llama_chat_format_single(const struct llama_model * model, + const std::string & tmpl, + const std::vector & past_msg, + const llama_chat_msg & new_msg, + bool add_ass) { + auto fmt_past_msg = llama_chat_format(model, tmpl, past_msg, false); + std::vector chat_new(past_msg); + chat_new.push_back(new_msg); + auto fmt_new_msg = llama_chat_format(model, tmpl, chat_new, add_ass); + auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); + return formatted; +} + // // KV cache utils // diff --git a/common/common.h b/common/common.h index 9a1dc4a2fe4c1..1e4f1583dfb78 100644 --- a/common/common.h +++ b/common/common.h @@ -360,9 +360,28 @@ bool llama_should_add_bos_token(const llama_model * model); // Chat template utils // +// same with llama_chat_message, but uses std::string +struct llama_chat_msg { + std::string role; + std::string content; +}; + // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid bool llama_chat_verify_template(const std::string & tmpl); +// CPP wrapper for llama_chat_apply_template +std::string llama_chat_format(const struct llama_model * model, + const std::string & tmpl, + const std::vector & chat, + bool add_ass); + +// Format single message, while taking into account the position of that message in chat history +std::string llama_chat_format_single(const struct llama_model * model, + const std::string & tmpl, + const std::vector & past_msg, + const llama_chat_msg & new_msg, + bool add_ass); + // // KV cache utils // diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b97b7b7937f02..f0770ac443212 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -31,20 +31,21 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static llama_context ** g_ctx; -static llama_model ** g_model; -static gpt_params * g_params; -static std::vector * g_input_tokens; -static std::ostringstream * g_output_ss; -static std::vector * g_output_tokens; +static llama_context ** g_ctx; +static llama_model ** g_model; +static gpt_params * g_params; +static std::vector * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector * g_output_tokens; +static std::vector * g_chat_msgs; static bool is_interacting = false; -static bool file_exists(const std::string &path) { +static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); } -static bool file_is_empty(const std::string &path) { +static bool file_is_empty(const std::string & path) { std::ifstream f; f.exceptions(std::ifstream::failbit | std::ifstream::badbit); f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate); @@ -117,6 +118,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } +static std::string chat_add_and_format(std::string role, std::string content) { + llama_chat_msg new_msg{role, content}; + auto formatted = llama_chat_format_single( + *g_model, g_params->chat_template, *g_chat_msgs, new_msg, role == "user"); + g_chat_msgs->push_back({role, content}); + return formatted; +} + int main(int argc, char ** argv) { gpt_params params; g_params = ¶ms; @@ -190,8 +199,10 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; llama_context * ctx_guidance = NULL; + std::vector chat_msgs; g_model = &model; g_ctx = &ctx; + g_chat_msgs = &chat_msgs; // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); @@ -249,16 +260,21 @@ int main(int argc, char ** argv) { std::vector embd_inp; - if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); - } else { - LOG("use session tokens\n"); - embd_inp = session_tokens; - } + { + auto prompt = params.conversation + ? chat_add_and_format("system", params.prompt) // format the system prompt in conversation mode + : params.prompt; + if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { + LOG("tokenize the prompt\n"); + embd_inp = ::llama_tokenize(ctx, prompt, true, true); + } else { + LOG("use session tokens\n"); + embd_inp = session_tokens; + } - LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG("prompt: \"%s\"\n", log_tostr(prompt)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + } // Should not run without any tokens if (embd_inp.empty()) { @@ -478,6 +494,7 @@ int main(int argc, char ** argv) { std::vector input_tokens; g_input_tokens = &input_tokens; std::vector output_tokens; g_output_tokens = &output_tokens; std::ostringstream output_ss; g_output_ss = &output_ss; + std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode // the first thing we will do is to output the prompt, so set color accordingly console::set_display(console::prompt); @@ -793,11 +810,18 @@ int main(int argc, char ** argv) { is_antiprompt = true; } + chat_add_and_format("system", assistant_ss.str()); is_interacting = true; printf("\n"); } } + // if current token is not EOG, we add it to current assistant message + if (params.conversation) { + auto id = llama_sampling_last(ctx_sampling); + assistant_ss << llama_token_to_piece(ctx, id, false); + } + if (n_past > 0 && is_interacting) { LOG("waiting for user input\n"); @@ -848,8 +872,14 @@ int main(int argc, char ** argv) { string_process_escapes(buffer); } + std::string user_inp = params.conversation + ? chat_add_and_format("user", buffer) + : buffer; + // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) + bool accept_special_content = params.conversation; + const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_inp = ::llama_tokenize(ctx, user_inp, false, accept_special_content); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); @@ -864,6 +894,9 @@ int main(int argc, char ** argv) { output_ss << llama_token_to_piece(ctx, token); } + // reset assistant message + assistant_ss.str(""); + n_remain -= line_inp.size(); LOG("n_remain: %d\n", n_remain); } else { diff --git a/llama.cpp b/llama.cpp index a05a52b4234cd..0c3f15e512e0e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -18589,10 +18589,10 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|im_start|>assistant\n"; } - } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) { + } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) { // llama2 template and its variants // [variant] support system message - bool support_system_message = tmpl.find("<>") != std::string::npos; + bool support_system_message = tmpl.find("<>") != std::string::npos || tmpl == "mistral"; // [variant] space before + after response bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos; // [variant] add BOS inside history diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index cef9a650bdfdf..d19ba8633e8c2 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -7,6 +7,7 @@ #include #include "llama.h" +#include "common.h" int main(void) { llama_chat_message conversation[] = { @@ -119,5 +120,24 @@ int main(void) { std::cout << output << "\n-------------------------\n"; assert(output == expected); } + + // test llama_chat_format_single + std::cout << "\n\n=== llama_chat_format_single ===\n\n"; + std::vector chat2; + chat2.push_back({"system", "You are a helpful assistant"}); + chat2.push_back({"user", "Hello"}); + chat2.push_back({"assistant", "I am assistant"}); + llama_chat_msg new_msg{"user", "How are you"}; + + auto fmt_single = [&](std::string tmpl) { + auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true); + std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n"; + return output; + }; + assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); + assert(fmt_single("llama2") == "[INST] How are you [/INST]"); + assert(fmt_single("gemma") == "user\nHow are you\nmodel\n"); + assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); + return 0; } From c91f972775285f9e16ed3f484181aec05081f11b Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 22 Jun 2024 20:25:26 +0200 Subject: [PATCH 02/10] add help message --- common/common.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index e88ce3f571c33..8667bf41418d6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1814,7 +1814,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main", " --cfg-negative-prompt-file FNAME", "negative prompt file to use for guidance" }); options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); - + options.push_back({ "main", " --chat-template JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "only commonly used templates are accepted:\n" + "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); options.push_back({ "grammar" }); options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); From 317452730dd05800e88c9e8fd1e85d972d399439 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 22 Jun 2024 20:30:33 +0200 Subject: [PATCH 03/10] server: simplify format_chat --- examples/server/utils.hpp | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 63fde9c9faabe..4eb0c56a3410d 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin // Format given chat. If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { - size_t alloc_size = 0; - // vector holding all allocated string to be passed to llama_chat_apply_template - std::vector str(messages.size() * 2); - std::vector chat(messages.size()); + std::vector chat; for (size_t i = 0; i < messages.size(); ++i) { const auto & curr_msg = messages[i]; - str[i*2 + 0] = json_value(curr_msg, "role", std::string("")); - str[i*2 + 1] = json_value(curr_msg, "content", std::string("")); - alloc_size += str[i*2 + 1].length(); - chat[i].role = str[i*2 + 0].c_str(); - chat[i].content = str[i*2 + 1].c_str(); + std::string role = json_value(curr_msg, "role", std::string("")); + std::string content = json_value(curr_msg, "content", std::string("")); + chat.push_back({role, content}); } - const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); - std::vector buf(alloc_size * 2); - - // run the first time to get the total output length - int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); - - // if it turns out that our buffer is too small, we resize it - if ((size_t) res > buf.size()) { - buf.resize(res); - res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size()); - } - - const std::string formatted_chat(buf.data(), res); - + auto formatted_chat = llama_chat_format(model, tmpl, chat, true); LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); - return formatted_chat; } From 962be6a834638746a16e3a2b9457314cdbed1b27 Mon Sep 17 00:00:00 2001 From: ngxson Date: Sat, 22 Jun 2024 22:57:16 +0200 Subject: [PATCH 04/10] more consistent naming --- common/common.cpp | 6 +++--- common/common.h | 2 +- examples/main/main.cpp | 4 +--- examples/server/utils.hpp | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 8667bf41418d6..388f650eca83d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2980,7 +2980,7 @@ bool llama_chat_verify_template(const std::string & tmpl) { return res >= 0; } -std::string llama_chat_format(const struct llama_model * model, +std::string llama_chat_apply_template(const struct llama_model * model, const std::string & tmpl, const std::vector & msgs, bool add_ass) { @@ -3010,10 +3010,10 @@ std::string llama_chat_format_single(const struct llama_model * model, const std::vector & past_msg, const llama_chat_msg & new_msg, bool add_ass) { - auto fmt_past_msg = llama_chat_format(model, tmpl, past_msg, false); + auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); std::vector chat_new(past_msg); chat_new.push_back(new_msg); - auto fmt_new_msg = llama_chat_format(model, tmpl, chat_new, add_ass); + auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); return formatted; } diff --git a/common/common.h b/common/common.h index 1e4f1583dfb78..6a64bb22b855b 100644 --- a/common/common.h +++ b/common/common.h @@ -370,7 +370,7 @@ struct llama_chat_msg { bool llama_chat_verify_template(const std::string & tmpl); // CPP wrapper for llama_chat_apply_template -std::string llama_chat_format(const struct llama_model * model, +std::string llama_chat_apply_template(const struct llama_model * model, const std::string & tmpl, const std::vector & chat, bool add_ass); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index f0770ac443212..36f060401916d 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -876,10 +876,8 @@ int main(int argc, char ** argv) { ? chat_add_and_format("user", buffer) : buffer; // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) - bool accept_special_content = params.conversation; - const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, user_inp, false, accept_special_content); + const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 4eb0c56a3410d..7ef2a519a10c7 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -127,7 +127,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat.push_back({role, content}); } - auto formatted_chat = llama_chat_format(model, tmpl, chat, true); + auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); return formatted_chat; } From 43cab6bfc6b61896857077eecde818499db7da8a Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 24 Jun 2024 10:45:31 +0200 Subject: [PATCH 05/10] improve --- common/common.cpp | 6 ++++-- examples/main/main.cpp | 18 ++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 388f650eca83d..b1de5615bfd9a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2984,13 +2984,15 @@ std::string llama_chat_apply_template(const struct llama_model * model, const std::string & tmpl, const std::vector & msgs, bool add_ass) { + int alloc_size = 0; std::vector chat; for (auto & msg : msgs) { chat.push_back({msg.role.c_str(), msg.content.c_str()}); + alloc_size += (msg.role.size() + msg.content.size()) * 1.25; } const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); - std::vector buf; + std::vector buf(alloc_size); // run the first time to get the total output length int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); @@ -3001,7 +3003,7 @@ std::string llama_chat_apply_template(const struct llama_model * model, res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); } - const std::string formatted_chat(buf.data(), res); + std::string formatted_chat(buf.data(), res); return formatted_chat; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 36f060401916d..e1f0a1a12fe67 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -37,7 +37,6 @@ static gpt_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; -static std::vector * g_chat_msgs; static bool is_interacting = false; static bool file_exists(const std::string & path) { @@ -118,13 +117,13 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } -static std::string chat_add_and_format(std::string role, std::string content) { +static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, std::string role, std::string content) { llama_chat_msg new_msg{role, content}; auto formatted = llama_chat_format_single( - *g_model, g_params->chat_template, *g_chat_msgs, new_msg, role == "user"); - g_chat_msgs->push_back({role, content}); + model, g_params->chat_template, chat_msgs, new_msg, role == "user"); + chat_msgs.push_back({role, content}); return formatted; -} +}; int main(int argc, char ** argv) { gpt_params params; @@ -202,7 +201,6 @@ int main(int argc, char ** argv) { std::vector chat_msgs; g_model = &model; g_ctx = &ctx; - g_chat_msgs = &chat_msgs; // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); @@ -262,7 +260,7 @@ int main(int argc, char ** argv) { { auto prompt = params.conversation - ? chat_add_and_format("system", params.prompt) // format the system prompt in conversation mode + ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); @@ -810,7 +808,7 @@ int main(int argc, char ** argv) { is_antiprompt = true; } - chat_add_and_format("system", assistant_ss.str()); + chat_add_and_format(model, chat_msgs, "system", assistant_ss.str()); is_interacting = true; printf("\n"); } @@ -873,8 +871,8 @@ int main(int argc, char ** argv) { } std::string user_inp = params.conversation - ? chat_add_and_format("user", buffer) - : buffer; + ? chat_add_and_format(model, chat_msgs, "user", buffer) + : std::move(buffer); // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation); From a3dbfabe93ff16674b38e75ceb3287adb6e29914 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 24 Jun 2024 10:52:17 +0200 Subject: [PATCH 06/10] add llama_chat_format_example --- common/common.cpp | 11 +++++++++++ common/common.h | 4 ++++ examples/main/main.cpp | 2 ++ examples/server/server.cpp | 12 ++---------- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b1de5615bfd9a..54e68accc2ccb 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -3020,6 +3020,17 @@ std::string llama_chat_format_single(const struct llama_model * model, return formatted; } +std::string llama_chat_format_example(const struct llama_model * model, + const std::string & tmpl) { + std::vector msgs = { + {"system", "You are a helpful assistant"}, + {"user", "Hello"}, + {"assistant", "Hi there"}, + {"user", "How are you?"}, + }; + return llama_chat_apply_template(model, tmpl, msgs, true); +} + // // KV cache utils // diff --git a/common/common.h b/common/common.h index 6a64bb22b855b..04cb8c30cd320 100644 --- a/common/common.h +++ b/common/common.h @@ -382,6 +382,10 @@ std::string llama_chat_format_single(const struct llama_model * model, const llama_chat_msg & new_msg, bool add_ass); +// Returns an example of formatted chat +std::string llama_chat_format_example(const struct llama_model * model, + const std::string & tmpl); + // // KV cache utils // diff --git a/examples/main/main.cpp b/examples/main/main.cpp index e1f0a1a12fe67..f76c885f45818 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -224,6 +224,8 @@ int main(int argc, char ** argv) { __func__, n_ctx_train, n_ctx); } + LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + // print system information { LOG_TEE("\n"); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f9a86961f9c8e..3aad57284e30a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2606,17 +2606,9 @@ int main(int argc, char ** argv) { // print sample chat example to make it clear which template is used { - json chat; - chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}}); - chat.push_back({{"role", "user"}, {"content", "Hello"}}); - chat.push_back({{"role", "assistant"}, {"content", "Hi there"}}); - chat.push_back({{"role", "user"}, {"content", "How are you?"}}); - - const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat); - LOG_INFO("chat template", { - {"chat_example", chat_example}, - {"built_in", params.chat_template.empty()}, + {"chat_example", llama_chat_format_example(model)}, + {"built_in", params.chat_template.empty()}, }); } From a1e9520995599e6686d78f854396ad55f1e46ee2 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 24 Jun 2024 10:56:55 +0200 Subject: [PATCH 07/10] fix server --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3aad57284e30a..ae768097baa0e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2607,7 +2607,7 @@ int main(int argc, char ** argv) { // print sample chat example to make it clear which template is used { LOG_INFO("chat template", { - {"chat_example", llama_chat_format_example(model)}, + {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, {"built_in", params.chat_template.empty()}, }); } From 7a7650231a74768873769de5c935b3d25f54eab7 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 24 Jun 2024 10:57:47 +0200 Subject: [PATCH 08/10] code style --- examples/main/main.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index f76c885f45818..510c690430146 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -31,12 +31,12 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static llama_context ** g_ctx; -static llama_model ** g_model; -static gpt_params * g_params; -static std::vector * g_input_tokens; -static std::ostringstream * g_output_ss; -static std::vector * g_output_tokens; +static llama_context ** g_ctx; +static llama_model ** g_model; +static gpt_params * g_params; +static std::vector * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector * g_output_tokens; static bool is_interacting = false; static bool file_exists(const std::string & path) { From a28e70fde859b017c979682a8108edc37171ee66 Mon Sep 17 00:00:00 2001 From: ngxson Date: Mon, 24 Jun 2024 11:07:08 +0200 Subject: [PATCH 09/10] code style --- examples/main/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 510c690430146..3a26d022025eb 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -123,7 +123,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vectorchat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); return formatted; -}; +} int main(int argc, char ** argv) { gpt_params params; From 895bb2a697d96d8ff836697ccd1428e2c9ced3f4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 25 Jun 2024 10:50:25 +0200 Subject: [PATCH 10/10] Update examples/main/main.cpp Co-authored-by: Georgi Gerganov --- examples/main/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 3a26d022025eb..cfaf6a6e8ba4a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -873,7 +873,7 @@ int main(int argc, char ** argv) { } std::string user_inp = params.conversation - ? chat_add_and_format(model, chat_msgs, "user", buffer) + ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) : std::move(buffer); // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);