
Commit 7e64bfe

refactor code + remove unused comments + improved README.md
1 parent 9f72b44 commit 7e64bfe

File tree

2 files changed: +102 −55 lines changed


examples/server/README.md

+27 −1 lines changed
@@ -26,6 +26,8 @@ Command line options:
 - `--embedding`: Enable embedding extraction, Default: disabled.
 - `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
 - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
+- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+-
 
 ## Build

@@ -164,7 +166,7 @@ node index.js
 
 `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
 
-`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications.
+`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 
 - **POST** `/tokenize`: Tokenize a given text.
 
@@ -196,8 +198,32 @@ node index.js
 
 It also accepts all the options of `/completion` except `stream` and `prompt`.
 
+- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+
 ## More examples
 
+### Change system prompt on runtime
+
+To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
+
+`prompt`: Specify a context that you want all connecting clients to respect.
+
+`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
+
+`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
+
+```json
+{
+    "system_prompt": {
+        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
+        "anti_prompt": "User:",
+        "assistant_name": "Assistant:"
+    }
+}
+```
+
+**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
+
 ### Interactive mode
 
 Check the sample in [chat.mjs](chat.mjs).
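As a rough illustration of the workflow the new README section describes, here is a minimal client sketch in TypeScript (in the spirit of `chat.mjs`). It assumes the server listens on `http://localhost:8080`, that `/props` exposes `assistant_name` and `anti_prompt` fields, and that `/completion` accepts a `stop` list and returns a `content` field; none of these details are spelled out in this diff, so treat them as assumptions rather than a documented API.

```typescript
// Hypothetical client sketch (not part of this commit).
// Assumptions: server at http://localhost:8080, /props returns
// { assistant_name, anti_prompt }, /completion accepts { prompt, stop }
// and returns { content }.
const base = "http://localhost:8080";

async function chat(userMessage: string): Promise<string> {
  // 1. Ask the server which names the shared system prompt was built with.
  const props = await fetch(`${base}/props`).then(r => r.json());
  const assistant: string = props.assistant_name; // e.g. "Assistant:"
  const anti: string = props.anti_prompt;         // e.g. "User:"

  // 2. Continue the shared transcript: user turn, then the assistant cue.
  const prompt = `${userMessage}\n${assistant}`;

  // 3. Stop generation when the model starts a new user turn.
  const res = await fetch(`${base}/completion`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ prompt, stop: [anti] }),
  });
  const data = await res.json();
  return data.content as string;
}

chat("Recommend a good book about physics.").then(console.log);
```

The only point of the sketch is the shape of the flow: read the shared names once from `/props`, then build each request around them and stop at the anti-prompt.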

examples/server/server.cpp

+75 −54 lines changed
@@ -3,7 +3,7 @@
 #include "build-info.h"
 #include "grammar-parser.h"
 
-#define SERVER_MULTIMODAL_SUPPORT
+// #define SERVER_MULTIMODAL_SUPPORT
 
 #ifdef SERVER_MULTIMODAL_SUPPORT
 #include "../llava/clip.h"
@@ -44,11 +44,6 @@ struct server_params
     int32_t write_timeout = 600;
 };
 
-// struct beam_search_callback_data {
-//     llama_server_context* ctx;
-//     llama_client_slot* slot;
-// };
-
 static bool server_verbose = false;
 
 #if SERVER_VERBOSE != 1
@@ -660,6 +655,7 @@ struct llama_server_context
        }
        waitAllAreIdle();
        all_slots_are_idle = true;
+
        // wait until system prompt load
        update_system_prompt = true;
        while(update_system_prompt) {
@@ -672,7 +668,11 @@ struct llama_server_context
        system_prompt = sys_props.value("prompt", "");
        user_name = sys_props.value("anti_prompt", "");
        assistant_name = sys_props.value("assistant_name", "");
-       notifySystemPromptChanged();
+       if(slots.size() > 0) {
+           notifySystemPromptChanged();
+       } else {
+           update_system_prompt = true;
+       }
     }
 
     void waitAllAreIdle() {
@@ -813,6 +813,7 @@ struct llama_server_context
        });
        return has_next_token; // continue
    }
+
 #ifdef SERVER_MULTIMODAL_SUPPORT
     bool processImages(llama_client_slot &slot) {
         for(slot_image &img : slot.images) {
@@ -1204,6 +1205,11 @@ struct llama_server_context
     }
 };
 
+struct server_beam_search_callback_data {
+    llama_context * ctx;
+    llama_client_slot * slot;
+};
+
 static void server_print_usage(const char *argv0, const gpt_params &params,
                                const server_params &sparams)
 {
@@ -1251,14 +1257,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
     printf("  -np N, --parallel N   number of slots for process requests (default: %d)\n", params.n_parallel);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  -spf FNAME, --system-prompt-file FNAME\n");
+    printf("                        Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
 #ifdef SERVER_MULTIMODAL_SUPPORT
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA.\n");
 #endif
     printf("\n");
 }
 
 static void server_params_parse(int argc, char **argv, server_params &sparams,
-                                gpt_params &params)
+                                gpt_params &params, llama_server_context& llama)
 {
     gpt_params default_params;
     server_params default_sparams;
@@ -1523,6 +1531,26 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 break;
             }
             params.n_predict = std::stoi(argv[i]);
+        } else if (arg == "-spf" || arg == "--system-prompt-file")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::string systm_content = "";
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(systm_content)
+            );
+            llama.processSystemPromptData(json::parse(systm_content));
         }
 #ifdef SERVER_MULTIMODAL_SUPPORT
         else if(arg == "--mmproj") {
@@ -1864,8 +1892,8 @@ static void log_server_request(const Request &req, const Response &res)
     });
 }
 
-static bool is_at_eob(llama_server_context * server_context, const llama_token *tokens, const size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context->ctx);
+static bool is_at_eob(const server_beam_search_callback_data & server_context, const llama_token *tokens, const size_t n_tokens) {
+    return n_tokens && tokens[n_tokens - 1] == llama_token_eos(server_context.ctx);
 }
 
 // Function matching type llama_beam_search_callback_fn_t.
@@ -1875,34 +1903,34 @@ static bool is_at_eob(llama_server_context *
 // This is also called when the stop condition is met.
 // Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
 
-// AVOID HEADACHES unnecessaries
-
-// static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
-//     auto & llama = *static_cast<beam_search_callback_data*>(callback_data);
-//     // Mark beams as EOS as needed.
-//     for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-//         llama_beam_view& beam_view = beams_state.beam_views[i];
-//         if (!beam_view.eob && is_at_eob(llama.ctx, beam_view.tokens, beam_view.n_tokens)) {
-//             beam_view.eob = true;
-//         }
-//     }
-//     printf(","); // Show progress
-//     if (const size_t n = beams_state.common_prefix_length) {
-//         llama.slot->generated_token_probs.resize(llama.slot->generated_token_probs.size() + n);
-//         assert(0u < beams_state.n_beams);
-//         const llama_token * tokens = beams_state.beam_views[0].tokens;
-//         const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
-//         std::transform(tokens, tokens + n, llama.slot->generated_token_probs.end() - n, map);
-//         printf("%zu", n);
-//     }
-//     fflush(stdout);
-// #if 0 // DEBUG: print current beams for this iteration
-//     std::cout << "\n\nCurrent beams:\n";
-//     for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
-//         std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
-//     }
-// #endif
-// }
+// NO TESTED after PR #3589
+
+static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
+    auto & llama = *static_cast<server_beam_search_callback_data*>(callback_data);
+    // Mark beams as EOS as needed.
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        llama_beam_view& beam_view = beams_state.beam_views[i];
+        if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
+            beam_view.eob = true;
+        }
+    }
+    printf(","); // Show progress
+    if (const size_t n = beams_state.common_prefix_length) {
+        llama.slot->generated_token_probs.resize(llama.slot->generated_token_probs.size() + n);
+        assert(0u < beams_state.n_beams);
+        const llama_token * tokens = beams_state.beam_views[0].tokens;
+        const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
+        std::transform(tokens, tokens + n, llama.slot->generated_token_probs.end() - n, map);
+        printf("%zu", n);
+    }
+    fflush(stdout);
+#if 0 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams:\n";
+    for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
+    }
+#endif
+}
 
 struct token_translator {
     llama_context * ctx;
@@ -1933,7 +1961,7 @@ int main(int argc, char **argv)
     // struct that contains llama context and inference
     llama_server_context llama;
 
-    server_params_parse(argc, argv, sparams, params);
+    server_params_parse(argc, argv, sparams, params, llama);
 
     if (params.model_alias == "unknown")
     {
@@ -2015,8 +2043,6 @@ int main(int argc, char **argv)
                 llama.processSystemPromptData(data["system_prompt"]);
             }
 
-            // llama_reset_timings(llama.ctx);
-
             slot->reset();
 
             parse_options_completion(data, slot, llama);
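The hunk above is the server-side hook for the README's "Change system prompt on runtime" section: when a `/completion` request body carries a `system_prompt` object, it is handed to `processSystemPromptData` before the slot is reset. A minimal sketch of such a set-up request follows; the address and the use of an empty `prompt` for a request that only installs the system prompt are assumptions, not something this diff documents.

```typescript
// Hypothetical sketch (not part of this commit): install a new shared
// system prompt on a running server through the /completion request body.
// Assumes the server is at http://localhost:8080 and tolerates an empty
// prompt when the request is only used to carry system_prompt.
const base = "http://localhost:8080";

async function setSystemPrompt(): Promise<void> {
  await fetch(`${base}/completion`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      prompt: "",
      system_prompt: {
        prompt: "Transcript of a dialog between a User and an Assistant.\nUser:",
        anti_prompt: "User:",
        assistant_name: "Assistant:",
      },
    }),
  });
}

setSystemPrompt().catch(console.error);
```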
@@ -2030,14 +2056,14 @@ int main(int argc, char **argv)
             if (!slot->params.stream) {
                 std::string completion_text = "";
                 if (llama.params.n_beams) {
-                    // // Fill llama.generated_token_probs vector with final beam.
-                    // beam_search_callback_data data_;
-                    // data_.slot = slot;
-                    // data_.ctx = &llama;
-                    // llama_beam_search(llama.ctx, beam_search_callback, &data_, llama.params.n_beams,
-                    //     slot->n_past, llama.params.n_predict);
-                    // // Translate llama.generated_token_probs to llama.generated_text.
-                    // append_to_generated_text_from_generated_token_probs(llama, slot);
+                    // Fill llama.generated_token_probs vector with final beam.
+                    server_beam_search_callback_data data_beam;
+                    data_beam.slot = slot;
+                    data_beam.ctx = llama.ctx;
+                    llama_beam_search(llama.ctx, beam_search_callback, &data_beam, llama.params.n_beams,
+                        slot->n_past, llama.params.n_predict);
+                    // Translate llama.generated_token_probs to llama.generated_text.
+                    append_to_generated_text_from_generated_token_probs(llama, slot);
                 } else {
                     while (slot->isProcessing()) {
                         if(slot->hasNewToken()) {
@@ -2055,8 +2081,6 @@ int main(int argc, char **argv)
                 }
 
                 const json data = format_final_response(llama, slot, completion_text, probs);
-
-                //llama_print_timings(llama.ctx);
                 slot->release();
                 res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
                                 "application/json");
@@ -2138,8 +2162,6 @@ int main(int argc, char **argv)
                 llama.processSystemPromptData(data["system_prompt"]);
             }
 
-            // llama_reset_timings(llama.ctx);
-
             slot->reset();
             slot->infill = true;
 
@@ -2167,7 +2189,6 @@ int main(int argc, char **argv)
                 }
 
                 const json data = format_final_response(llama, slot, completion_text, probs);
-                //llama_print_timings(llama.ctx);
                 res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
                                 "application/json");
             } else {

0 commit comments