From 5a2fde838596887d165d75ce686ee2eee7695f41 Mon Sep 17 00:00:00 2001
From: ngxson <thichthat@gmail.com>
Date: Sat, 22 Jun 2024 20:24:14 +0200
Subject: [PATCH 01/10] add chat template support for llama-cli

---
 common/common.cpp            | 42 ++++++++++++++++++++++
 common/common.h              | 19 ++++++++++
 examples/main/main.cpp       | 69 ++++++++++++++++++++++++++----------
 llama.cpp                    |  4 +--
 tests/test-chat-template.cpp | 20 +++++++++++
 5 files changed, 134 insertions(+), 20 deletions(-)
diff --git a/common/common.cpp b/common/common.cpp
index cfdedcbae0cd9..e88ce3f571c33 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2967,12 +2967,54 @@ bool llama_should_add_bos_token(const llama_model * model) {
     return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
 }
 
+//
+// Chat template utils
+//
+
 bool llama_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
+std::string llama_chat_format(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & msgs,
+        bool add_ass) {
+    std::vector<llama_chat_message> chat;
+    for (auto & msg : msgs) {
+        chat.push_back({msg.role.c_str(), msg.content.c_str()});
+    }
+
+    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    std::vector<char> buf;
+
+    // run the first time to get the total output length
+    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+
+    // if it turns out that our buffer is too small, we resize it
+    if ((size_t) res > buf.size()) {
+        buf.resize(res);
+        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    }
+
+    const std::string formatted_chat(buf.data(), res);
+    return formatted_chat;
+}
+
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass) {
+    auto fmt_past_msg = llama_chat_format(model, tmpl, past_msg, false);
+    std::vector<llama_chat_msg> chat_new(past_msg);
+    chat_new.push_back(new_msg);
+    auto fmt_new_msg = llama_chat_format(model, tmpl, chat_new, add_ass);
+    auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return formatted;
+}
+
 //
 // KV cache utils
 //
diff --git a/common/common.h b/common/common.h
index 9a1dc4a2fe4c1..1e4f1583dfb78 100644
--- a/common/common.h
+++ b/common/common.h
@@ -360,9 +360,28 @@ bool llama_should_add_bos_token(const llama_model * model);
 // Chat template utils
 //
 
+// same as llama_chat_message, but uses std::string
+struct llama_chat_msg {
+    std::string role;
+    std::string content;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool llama_chat_verify_template(const std::string & tmpl);
 
+// CPP wrapper for llama_chat_apply_template
+std::string llama_chat_format(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & chat,
+        bool add_ass);
+
+// Format single message, while taking into account the position of that message in chat history
+std::string llama_chat_format_single(const struct llama_model * model,
+        const std::string & tmpl,
+        const std::vector<llama_chat_msg> & past_msg,
+        const llama_chat_msg & new_msg,
+        bool add_ass);
+
 //
 // KV cache utils
 //
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index b97b7b7937f02..f0770ac443212 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -31,20 +31,21 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static llama_context           ** g_ctx;
-static llama_model             ** g_model;
-static gpt_params               * g_params;
-static std::vector<llama_token> * g_input_tokens;
-static std::ostringstream       * g_output_ss;
-static std::vector<llama_token> * g_output_tokens;
+static llama_context                 ** g_ctx;
+static llama_model                   ** g_model;
+static gpt_params                     * g_params;
+static std::vector<llama_token>       * g_input_tokens;
+static std::ostringstream             * g_output_ss;
+static std::vector<llama_token>       * g_output_tokens;
+static std::vector<llama_chat_msg>    * g_chat_msgs;
 static bool is_interacting = false;
 
-static bool file_exists(const std::string &path) {
+static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
 }
 
-static bool file_is_empty(const std::string &path) {
+static bool file_is_empty(const std::string & path) {
     std::ifstream f;
     f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
@@ -117,6 +118,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }
 
+static std::string chat_add_and_format(std::string role, std::string content) {
+    llama_chat_msg new_msg{role, content};
+    auto formatted = llama_chat_format_single(
+        *g_model, g_params->chat_template, *g_chat_msgs, new_msg, role == "user");
+    g_chat_msgs->push_back({role, content});
+    return formatted;
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
@@ -190,8 +199,10 @@ int main(int argc, char ** argv) {
     llama_model * model;
     llama_context * ctx;
     llama_context * ctx_guidance = NULL;
+    std::vector<llama_chat_msg> chat_msgs;
     g_model = &model;
     g_ctx = &ctx;
+    g_chat_msgs = &chat_msgs;
 
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
@@ -249,16 +260,21 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
-    if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
-        LOG("tokenize the prompt\n");
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-    } else {
-        LOG("use session tokens\n");
-        embd_inp = session_tokens;
-    }
+    {
+        auto prompt = params.conversation
+            ? chat_add_and_format("system", params.prompt) // format the system prompt in conversation mode
+            : params.prompt;
+        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+            LOG("tokenize the prompt\n");
+            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+        } else {
+            LOG("use session tokens\n");
+            embd_inp = session_tokens;
+        }
 
-    LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
-    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+        LOG("prompt: \"%s\"\n", log_tostr(prompt));
+        LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+    }
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
@@ -478,6 +494,7 @@ int main(int argc, char ** argv) {
     std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
     std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
     std::ostringstream output_ss;     g_output_ss     = &output_ss;
+    std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
 
     // the first thing we will do is to output the prompt, so set color accordingly
     console::set_display(console::prompt);
@@ -793,11 +810,18 @@ int main(int argc, char ** argv) {
                         is_antiprompt = true;
                     }
 
+                    chat_add_and_format("system", assistant_ss.str());
                     is_interacting = true;
                     printf("\n");
                 }
             }
 
+            // if current token is not EOG, we add it to current assistant message
+            if (params.conversation) {
+                auto id = llama_sampling_last(ctx_sampling);
+                assistant_ss << llama_token_to_piece(ctx, id, false);
+            }
+
             if (n_past > 0 && is_interacting) {
                 LOG("waiting for user input\n");
 
@@ -848,8 +872,14 @@ int main(int argc, char ** argv) {
                         string_process_escapes(buffer);
                     }
 
+                    std::string user_inp = params.conversation
+                        ? chat_add_and_format("user", buffer)
+                        : buffer;
+                    // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
+                    bool accept_special_content = params.conversation;
+
                     const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
+                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, accept_special_content);
                     const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
                     LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@@ -864,6 +894,9 @@ int main(int argc, char ** argv) {
                         output_ss << llama_token_to_piece(ctx, token);
                     }
 
+                    // reset assistant message
+                    assistant_ss.str("");
+
                     n_remain -= line_inp.size();
                     LOG("n_remain: %d\n", n_remain);
                 } else {
diff --git a/llama.cpp b/llama.cpp
index a05a52b4234cd..0c3f15e512e0e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -18589,10 +18589,10 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == "llama2" || tmpl.find("[INST]") != std::string::npos) {
+    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
         // llama2 template and its variants
         // [variant] support system message
-        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
+        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
         // [variant] space before + after response
         bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
         // [variant] add BOS inside history
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index cef9a650bdfdf..d19ba8633e8c2 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -7,6 +7,7 @@
 #include <cassert>
 
 #include "llama.h"
+#include "common.h"
 
 int main(void) {
     llama_chat_message conversation[] = {
@@ -119,5 +120,24 @@ int main(void) {
         std::cout << output << "\n-------------------------\n";
         assert(output == expected);
     }
+
+    // test llama_chat_format_single
+    std::cout << "\n\n=== llama_chat_format_single ===\n\n";
+    std::vector<llama_chat_msg> chat2;
+    chat2.push_back({"system", "You are a helpful assistant"});
+    chat2.push_back({"user", "Hello"});
+    chat2.push_back({"assistant", "I am assistant"});
+    llama_chat_msg new_msg{"user", "How are you"};
+
+    auto fmt_single = [&](std::string tmpl) {
+        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
+        std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
+        return output;
+    };
+    assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
+    assert(fmt_single("llama2") == "[INST] How are you [/INST]");
+    assert(fmt_single("gemma") == "<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
+    assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
+
     return 0;
 }

From c91f972775285f9e16ed3f484181aec05081f11b Mon Sep 17 00:00:00 2001
From: ngxson <thichthat@gmail.com>
Date: Sat, 22 Jun 2024 20:25:26 +0200
Subject: [PATCH 02/10] add help message

---
 common/common.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index e88ce3f571c33..8667bf41418d6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1814,7 +1814,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main",        "       --cfg-negative-prompt-file FNAME",
                                                                         "negative prompt file to use for guidance" });
     options.push_back({ "main",        "       --cfg-scale N",          "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
-
+    options.push_back({ "main",        "       --chat-template JINJA_TEMPLATE",
+                                                                        "set custom jinja chat template (default: template taken from model's metadata)\n"
+                                                                        "only commonly used templates are accepted:\n"
+                                                                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "grammar" });
     options.push_back({ "*",           "       --grammar GRAMMAR",      "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
     options.push_back({ "*",           "       --grammar-file FNAME",   "file to read grammar from" });

From 317452730dd05800e88c9e8fd1e85d972d399439 Mon Sep 17 00:00:00 2001
From: ngxson <thichthat@gmail.com>
Date: Sat, 22 Jun 2024 20:30:33 +0200
Subject: [PATCH 03/10] server: simplify format_chat

---
 examples/server/utils.hpp | 29 +++++------------------------
 1 file changed, 5 insertions(+), 24 deletions(-)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 63fde9c9faabe..4eb0c56a3410d 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin
 
 // Format given chat. If tmpl is empty, we take the template from model metadata
 inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    size_t alloc_size = 0;
-    // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+    std::vector<llama_chat_msg> chat;
 
     for (size_t i = 0; i < messages.size(); ++i) {
         const auto & curr_msg = messages[i];
-        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
-        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
-        alloc_size     += str[i*2 + 1].length();
-        chat[i].role    = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
+        std::string role    = json_value(curr_msg, "role",    std::string(""));
+        std::string content = json_value(curr_msg, "content", std::string(""));
+        chat.push_back({role, content});
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
-
-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-
-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-    }
-
-    const std::string formatted_chat(buf.data(), res);
-
+    auto formatted_chat = llama_chat_format(model, tmpl, chat, true);
     LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
     return formatted_chat;
 }
 

From 962be6a834638746a16e3a2b9457314cdbed1b27 Mon Sep 17 00:00:00 2001
From: ngxson <thichthat@gmail.com>
Date: Sat, 22 Jun 2024 22:57:16 +0200
Subject: [PATCH 04/10] more consistent naming

---
 common/common.cpp         | 6 +++---
 common/common.h           | 2 +-
 examples/main/main.cpp    | 4 +---
 examples/server/utils.hpp | 2 +-
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 8667bf41418d6..388f650eca83d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2980,7 +2980,7 @@ bool llama_chat_verify_template(const std::string & tmpl) {
     return res >= 0;
 }
 
-std::string llama_chat_format(const struct llama_model * model,
+std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & msgs,
         bool add_ass) {
@@ -3010,10 +3010,10 @@ std::string llama_chat_format_single(const struct llama_model * model,
         const std::vector<llama_chat_msg> & past_msg,
         const llama_chat_msg & new_msg,
         bool add_ass) {
-    auto fmt_past_msg = llama_chat_format(model, tmpl, past_msg, false);
+    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
     std::vector<llama_chat_msg> chat_new(past_msg);
     chat_new.push_back(new_msg);
-    auto fmt_new_msg = llama_chat_format(model, tmpl, chat_new, add_ass);
+    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
     auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
     return formatted;
 }
diff --git a/common/common.h b/common/common.h
index 1e4f1583dfb78..6a64bb22b855b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -370,7 +370,7 @@ struct llama_chat_msg {
 bool llama_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
-std::string llama_chat_format(const struct llama_model * model,
+std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & chat,
         bool add_ass);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index f0770ac443212..36f060401916d 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -876,10 +876,8 @@ int main(int argc, char ** argv) {
                         ? chat_add_and_format("user", buffer)
                         : buffer;
                     // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
-                    bool accept_special_content = params.conversation;
-
                     const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, accept_special_content);
+                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, params.conversation);
                     const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
                     LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 4eb0c56a3410d..7ef2a519a10c7 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -127,7 +127,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
         chat.push_back({role, content});
     }
 
-    auto formatted_chat = llama_chat_format(model, tmpl, chat, true);
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
     LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
     return formatted_chat;
 }

From 43cab6bfc6b61896857077eecde818499db7da8a Mon Sep 17 00:00:00 2001
From: ngxson <thichthat@gmail.com>
Date: Mon, 24 Jun 2024 10:45:31 +0200
Subject: [PATCH 05/10] improve: pre-size the template buffer and drop the g_chat_msgs global

---
 common/common.cpp      |  6 ++++--
 examples/main/main.cpp | 18 ++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 388f650eca83d..b1de5615bfd9a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2984,24 +2984,33 @@ std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & msgs,
         bool add_ass) {
+    int alloc_size = 0;
     std::vector<llama_chat_message> chat;
     for (auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
+        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }
 
     const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf;
+    std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
     int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
+    // a negative result means the template is not supported; fall back to chatml
+    // instead of casting the error code to size_t and resizing to a huge buffer
+    if (res < 0) {
+        ptr_tmpl = "chatml";
+        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    }
+
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
         res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
 
-    const std::string formatted_chat(buf.data(), res);
+    std::string formatted_chat(buf.data(), res);
     return formatted_chat;
 }
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 36f060401916d..e1f0a1a12fe67 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -37,7 +37,6 @@ static gpt_params                     * g_params;
 static std::vector<llama_token>       * g_input_tokens;
 static std::ostringstream             * g_output_ss;
 static std::vector<llama_token>       * g_output_tokens;
-static std::vector<llama_chat_msg>    * g_chat_msgs;
 static bool is_interacting = false;
 
 static bool file_exists(const std::string & path) {
@@ -118,13 +117,13 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }
 
-static std::string chat_add_and_format(std::string role, std::string content) {
+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
     llama_chat_msg new_msg{role, content};
     auto formatted = llama_chat_format_single(
-        *g_model, g_params->chat_template, *g_chat_msgs, new_msg, role == "user");
-    g_chat_msgs->push_back({role, content});
+        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    chat_msgs.push_back({role, content});
     return formatted;
-}
+};
 
 int main(int argc, char ** argv) {
     gpt_params params;
@@ -202,7 +201,6 @@ int main(int argc, char ** argv) {
     std::vector<llama_chat_msg> chat_msgs;
     g_model = &model;
     g_ctx = &ctx;
-    g_chat_msgs = &chat_msgs;
 
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
@@ -262,7 +260,7 @@ int main(int argc, char ** argv) {
 
     {
         auto prompt = params.conversation
-            ? chat_add_and_format("system", params.prompt) // format the system prompt in conversation mode
+            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
             LOG("tokenize the prompt\n");
@@ -810,7 +808,8 @@ int main(int argc, char ** argv) {
                         is_antiprompt = true;
                     }
 
-                    chat_add_and_format("system", assistant_ss.str());
+                    // record the finished reply under the "assistant" role so later turns are templated correctly
+                    chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
                     is_interacting = true;
                     printf("\n");
                 }
@@ -873,8 +871,8 @@ int main(int argc, char ** argv) {
                     }
 
                     std::string user_inp = params.conversation
-                        ? chat_add_and_format("user", buffer)
-                        : buffer;
+                        ? chat_add_and_format(model, chat_msgs, "user", buffer)
+                        : std::move(buffer);
                     // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                     const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
                     const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, params.conversation);

From a3dbfabe93ff16674b38e75ceb3287adb6e29914 Mon Sep 17 00:00:00 2001
From: ngxson <thichthat@gmail.com>
Date: Mon, 24 Jun 2024 10:52:17 +0200
Subject: [PATCH 06/10] add llama_chat_format_example

---
 common/common.cpp          | 11 +++++++++++
 common/common.h            |  4 ++++
 examples/main/main.cpp     |  2 ++
 examples/server/server.cpp | 12 ++----------
 4 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index b1de5615bfd9a..54e68accc2ccb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -3020,6 +3020,17 @@ std::string llama_chat_format_single(const struct llama_model * model,
     return formatted;
 }
 
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl) {
+    std::vector<llama_chat_msg> msgs = {
+        {"system",    "You are a helpful assistant"},
+        {"user",      "Hello"},
+        {"assistant", "Hi there"},
+        {"user",      "How are you?"},
+    };
+    return llama_chat_apply_template(model, tmpl, msgs, true);
+}
+
 //
 // KV cache utils
 //
diff --git a/common/common.h b/common/common.h
index 6a64bb22b855b..04cb8c30cd320 100644
--- a/common/common.h
+++ b/common/common.h
@@ -382,6 +382,10 @@ std::string llama_chat_format_single(const struct llama_model * model,
         const llama_chat_msg & new_msg,
         bool add_ass);
 
+// Returns an example of formatted chat
+std::string llama_chat_format_example(const struct llama_model * model,
+        const std::string & tmpl);
+
 //
 // KV cache utils
 //
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index e1f0a1a12fe67..f76c885f45818 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -224,6 +224,8 @@ int main(int argc, char ** argv) {
                 __func__, n_ctx_train, n_ctx);
     }
 
+    LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+
     // print system information
     {
         LOG_TEE("\n");
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f9a86961f9c8e..3aad57284e30a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2606,17 +2606,9 @@ int main(int argc, char ** argv) {
 
     // print sample chat example to make it clear which template is used
     {
-        json chat;
-        chat.push_back({{"role", "system"},    {"content", "You are a helpful assistant"}});
-        chat.push_back({{"role", "user"},      {"content", "Hello"}});
-        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
-        chat.push_back({{"role", "user"},      {"content", "How are you?"}});
-
-        const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
-
         LOG_INFO("chat template", {
-            {"chat_example", chat_example},
-            {"built_in", params.chat_template.empty()},
+            {"chat_example", llama_chat_format_example(model)},
+            {"built_in",     params.chat_template.empty()},
         });
     }
 

From a1e9520995599e6686d78f854396ad55f1e46ee2 Mon Sep 17 00:00:00 2001
From: ngxson <thichthat@gmail.com>
Date: Mon, 24 Jun 2024 10:56:55 +0200
Subject: [PATCH 07/10] fix server

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3aad57284e30a..ae768097baa0e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2607,7 +2607,7 @@ int main(int argc, char ** argv) {
     // print sample chat example to make it clear which template is used
     {
         LOG_INFO("chat template", {
-            {"chat_example", llama_chat_format_example(model)},
+            {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
             {"built_in",     params.chat_template.empty()},
         });
     }

From 7a7650231a74768873769de5c935b3d25f54eab7 Mon Sep 17 00:00:00 2001
From: ngxson <thichthat@gmail.com>
Date: Mon, 24 Jun 2024 10:57:47 +0200
Subject: [PATCH 08/10] code style

---
 examples/main/main.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index f76c885f45818..510c690430146 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -31,12 +31,12 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static llama_context                 ** g_ctx;
-static llama_model                   ** g_model;
-static gpt_params                     * g_params;
-static std::vector<llama_token>       * g_input_tokens;
-static std::ostringstream             * g_output_ss;
-static std::vector<llama_token>       * g_output_tokens;
+static llama_context           ** g_ctx;
+static llama_model             ** g_model;
+static gpt_params               * g_params;
+static std::vector<llama_token> * g_input_tokens;
+static std::ostringstream       * g_output_ss;
+static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 
 static bool file_exists(const std::string & path) {

From a28e70fde859b017c979682a8108edc37171ee66 Mon Sep 17 00:00:00 2001
From: ngxson <thichthat@gmail.com>
Date: Mon, 24 Jun 2024 11:07:08 +0200
Subject: [PATCH 09/10] code style

---
 examples/main/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 510c690430146..3a26d022025eb 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -123,7 +123,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
         model, g_params->chat_template, chat_msgs, new_msg, role == "user");
     chat_msgs.push_back({role, content});
     return formatted;
-};
+}
 
 int main(int argc, char ** argv) {
     gpt_params params;

From 895bb2a697d96d8ff836697ccd1428e2c9ced3f4 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 25 Jun 2024 10:50:25 +0200
Subject: [PATCH 10/10] Update examples/main/main.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/main/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 3a26d022025eb..cfaf6a6e8ba4a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -873,7 +873,7 @@ int main(int argc, char ** argv) {
                     }
 
                     std::string user_inp = params.conversation
-                        ? chat_add_and_format(model, chat_msgs, "user", buffer)
+                        ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                         : std::move(buffer);
                     // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                     const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);