
Commit baaa5c5

ochafik authored and ggerganov committed
server: fix tool-call of DeepSeek R1 Qwen, return reasoning_content (Command R7B & DeepSeek R1) unless --reasoning-format none (ggml-org#11607)

* extract & return thoughts in reasoning_content field (unless --reasoning-format none) for DeepSeek R1 & Command R7B
* tool-calls: add DeepSeek R1 template (models/templates/llama-cpp-deepseek-r1.jinja) + accommodate the broken official template
* tool-calls: accommodate the variety of wrong tool-call opening tags that both the R1 Qwen 32B and 7B distills like to emit
* server/oai: ensure content is null when there are tool calls, and that reasoning_content appears before content, for readability
* tool-calls: add DeepSeek R1 Qwen distills to server/README.md & server tests

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
1 parent: b990379

17 files changed: +1024 -317 lines
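
To make the new response shape concrete, here is a minimal sketch of the message object the server now emits for a non-streamed tool-call response: reasoning_content comes first, and content is explicitly null when tool calls are present. The tool call and its values are invented for illustration; nlohmann::json is the library server.cpp already uses (see its diff below):

```cpp
// Sketch of the new message shape; illustrative values only.
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::ordered_json tool_call = {
        {"type", "function"},
        {"function", {
            {"name", "get_weather"},                // hypothetical tool
            {"arguments", "{\"city\": \"Paris\"}"},
        }},
        {"id", "call_1"},
    };
    nlohmann::ordered_json message = {
        {"role", "assistant"},
        {"reasoning_content", "The user asked about weather, so I should call get_weather."},
        {"content", nullptr},                       // null because tool_calls is non-empty
        {"tool_calls", nlohmann::ordered_json::array({tool_call})},
    };
    std::cout << message.dump(2) << std::endl;
    return 0;
}
```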

common/arg.cpp (+11)
```diff
@@ -1982,6 +1982,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
```

(Note: the error branch must `throw` the `std::invalid_argument`; constructing it without throwing would silently accept bad values.)

common/chat.cpp (+219 -100)

Large diffs are not rendered by default.
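
The extraction logic itself lives in this file's unrendered diff. As a rough, simplified sketch of the core idea (the real parser also handles tool-call markup and Command R7B's different thought markers; the helper below is illustrative, not from the commit):

```cpp
// Illustrative helper, not from the commit: split a raw completion such as
// "<think>plan...</think>answer" into reasoning and visible content.
#include <string>

struct extracted_msg {
    std::string reasoning_content;
    std::string content;
};

static extracted_msg extract_reasoning(const std::string & raw) {
    extracted_msg out;
    const std::string open_tag = "<think>", close_tag = "</think>";
    const auto start = raw.find(open_tag);
    const auto end   = raw.find(close_tag);
    if (start != std::string::npos && end != std::string::npos && start < end) {
        out.reasoning_content = raw.substr(start + open_tag.size(), end - (start + open_tag.size()));
        out.content           = raw.substr(end + close_tag.size());
    } else {
        out.content = raw; // no thought tags: same result as --reasoning-format none
    }
    return out;
}
```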

common/chat.hpp (+3)
```diff
@@ -19,6 +19,7 @@ struct common_chat_inputs {
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;
+    bool extract_reasoning = true;
 };
 
 enum common_chat_format {
@@ -28,11 +29,13 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
```
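
Each reasoning-capable format now has a paired *_EXTRACT_REASONING variant, presumably selected from the new extract_reasoning input. A plausible selection helper (an assumption on my part; the actual logic is in common/chat.cpp's unrendered diff):

```cpp
// Assumed selection logic (illustrative, not copied from the commit):
static common_chat_format pick_deepseek_r1_format(const common_chat_inputs & inputs) {
    return inputs.extract_reasoning
        ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING
        : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
}
```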

common/common.h (+7 -1)
```diff
@@ -203,6 +203,11 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+enum common_reasoning_format {
+    COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+};
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
@@ -347,6 +352,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
 
     std::vector<std::string> api_keys;
 
@@ -624,7 +630,7 @@ struct common_chat_msg {
     std::string role;
     std::string content;
     std::vector<common_tool_call> tool_calls;
-    std::string tool_plan = "";
+    std::string reasoning_content = "";
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
```
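
Since the default is COMMON_REASONING_FORMAT_DEEPSEEK, extraction is on unless --reasoning-format none is passed. The wiring from this param to common_chat_inputs::extract_reasoning is not shown in the rendered hunks; a plausible sketch:

```cpp
// Assumed glue (illustrative): map the CLI enum onto the chat-input flag.
common_chat_inputs inputs;
inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
```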

common/sampling.cpp (+6 -6)
```diff
@@ -151,12 +151,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     lparams.no_perf = params.no_perf;
 
-    std::vector<const char *> trigger_words;
-    trigger_words.reserve(params.grammar_trigger_words.size());
-    for (const auto & str : params.grammar_trigger_words) {
-        trigger_words.push_back(str.word.c_str());
-    }
-
     struct llama_sampler * grmr;
     if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
@@ -165,6 +159,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
     } else {
+        std::vector<const char *> trigger_words;
+        trigger_words.reserve(params.grammar_trigger_words.size());
+        for (const auto & str : params.grammar_trigger_words) {
+            trigger_words.push_back(str.word.c_str());
+        }
+
         grmr = params.grammar_lazy
             ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
                   trigger_words.data(), trigger_words.size(),
```

(The trigger-words buffer is only consumed by the grammar samplers in the else branch, so this change scopes its construction to the one place it is used; the llguidance path never needed it.)

examples/server/README.md (+251 -50)

Large diffs are not rendered by default.

examples/server/server.cpp (+16 -13)
```diff
@@ -173,6 +173,7 @@ struct slot_params {
             {"grammar_trigger_words",  grammar_trigger_words},
             {"grammar_trigger_tokens", sampling.grammar_trigger_tokens},
             {"preserved_tokens",       sampling.preserved_tokens},
+            {"chat_format",            common_chat_format_name(oaicompat_chat_format)},
             {"samplers",               samplers},
             {"speculative.n_max",      speculative.n_max},
             {"speculative.n_min",      speculative.n_min},
@@ -724,9 +725,19 @@ struct server_task_result_cmpl_final : server_task_result {
             msg.content = content;
         }
 
-        json tool_calls;
+        json message {
+            {"role", "assistant"},
+        };
+        if (!msg.reasoning_content.empty()) {
+            message["reasoning_content"] = msg.reasoning_content;
+        }
+        if (msg.content.empty() && !msg.tool_calls.empty()) {
+            message["content"] = json();
+        } else {
+            message["content"] = msg.content;
+        }
         if (!msg.tool_calls.empty()) {
-            tool_calls = json::array();
+            auto tool_calls = json::array();
             for (const auto & tc : msg.tool_calls) {
                 tool_calls.push_back({
                     {"type", "function"},
@@ -737,15 +748,7 @@ struct server_task_result_cmpl_final : server_task_result {
                     {"id", tc.id},
                 });
             }
-        }
-
-        json message {
-            {"content", msg.content},
-            {"tool_calls", tool_calls},
-            {"role", "assistant"},
-        };
-        if (!msg.tool_plan.empty()) {
-            message["tool_plan"] = msg.tool_plan;
+            message["tool_calls"] = tool_calls;
         }
 
         json choice {
@@ -4060,7 +4063,7 @@ int main(int argc, char ** argv) {
         }
 
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
 
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
@@ -4073,7 +4076,7 @@ int main(int argc, char ** argv) {
     // same with handle_chat_completions, but without inference part
     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
         auto body = json::parse(req.body);
-        json data = oaicompat_completion_params_parse(body, params.use_jinja, ctx_server.chat_templates);
+        json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates);
         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
     };
 
```
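
One consequence for API consumers: content is now JSON null, rather than an empty string, whenever tool calls are present, and extracted thoughts arrive in a separate reasoning_content field. A hypothetical client-side helper illustrating tolerant handling (names and logic are illustrative, not part of the commit):

```cpp
// Hypothetical client-side helper: pull displayable text out of the new
// message shape (content may be JSON null when tool_calls is present).
#include <nlohmann/json.hpp>
#include <string>

static std::string visible_text(const nlohmann::json & message) {
    std::string out;
    // Extracted thoughts, if any, arrive in a separate field.
    if (message.contains("reasoning_content")) {
        out += "[thinking] " + message["reasoning_content"].get<std::string>() + "\n";
    }
    // content is null (not "") when the model made tool calls.
    if (message.contains("content") && !message["content"].is_null()) {
        out += message["content"].get<std::string>();
    }
    return out;
}
```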
