
Commit 438c2ca

Authored by ggerganov, FSSRepo, damian0815, jhen0409, and monatis
server : parallel decoding and multimodal (#3677)
* implementing parallel decoding in server example
* crash fixed
* save dev progress
* refactored sampling function
* completion endpoint working
* multiple client support
* grammar + no stream completion
* cached prompt support
* chat.mjs supports cached prompt + some fixes
* server ui now supports multiple clients
* unused change reverted
* fixed timings per slot
* add context swap
* add changes to README.md
* llava multimodal integration
* fixed tokens probs
* add multimodal input - alpha
* refactor code + remove unused comments + improved README.md
* fix compilation errors with llvm
* notify the user from server ui that multimodality is unavailable
* some ci fixes
* fix ci make build undefined ref errors
* fix prompts longer than ctx, as proposed in #3639
* fixed premature end due to stop word
* context shift fixed
* fix llava implementation
* sync README.md changes
* readme change
* update api like OpenAI
* multimodal support enabled by default
* fix make build errors
* fix multiple clients
* fix zig build
* new sampling API
* latest changes of sampling API
* server : coding-style normalization
* server : coding-style normalization (part 2)
* server : remove beam-search functionality
* server : bug fix in ingest_images: n_tokens is incremented internally by llama_batch_add
* server : use refs + use llama_batch_clear()
* server : snake case
* server : minor sync
* added thread safe pipeline
* server : batch has to be allocated for n_parallel sequences
* server : no need for atomic int - already using mutex
* server : logs + minor code style
* server : fix multibyte handle in partial response (#3706)
* fix image load + view image in chat
* make : silence stb warnings
* clip : link to ggml, not to llama
* server : fix switch fallthrough
* server : fix crash in Debug on macOS (I have no idea why this fixes it!?)
* server : refactor ctx_sampling init + n_ctx + names
* server : bug fix for prompt caching
* Do not save/load image_data to localStorage
* editorconfig : new line in index.html
* server : completion requests remember slot_id
* Update readme to document multimodal in server
* server : minor style
* Update readme to document multimodal in server
* server : hide ctx_sampling->prev behind API (#3696)
* server : apply fix from #3722
* server : fix slot reuse
* server : add comment about changing slot_state to bool

---------

Co-authored-by: FSSRepo <go778sgt@gmail.com>
Co-authored-by: Damian Stewart <d@damianstewart.com>
Co-authored-by: Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
Co-authored-by: Jhen-Jie Hong <iainst0409@gmail.com>
Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
1 parent 9e70cc0 commit 438c2ca

File tree

12 files changed: +3927 −2897 lines


.gitignore (+1)

@@ -10,6 +10,7 @@
 *.gcno
 *.gcda
 *.dot
+*.bat
 *.metallib
 .DS_Store
 .build/

Makefile (+2 −2)

@@ -605,8 +605,8 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

build.zig (+2 −1)

@@ -131,6 +131,7 @@ pub fn build(b: *std.build.Builder) !void {
     const sampling = make.obj("sampling", "common/sampling.cpp");
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
+    const clip = make.obj("clip", "examples/llava/clip.cpp");

     _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
@@ -139,7 +140,7 @@ pub fn build(b: *std.build.Builder) !void {
     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }

examples/llava/CMakeLists.txt (+1 −1)

@@ -1,7 +1,7 @@
 set(TARGET clip)
 add_library(${TARGET} clip.cpp clip.h)
 install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
     target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h

examples/llava/clip.cpp (+2 −2)

@@ -610,8 +610,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
     int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
     for (int i = 0; i < 3; ++i) {
-        new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
-        new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
+        new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
+        new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
     }

     if (verbosity >= 2) {

examples/server/CMakeLists.txt (+1 −1)

@@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()

examples/server/README.md (+36)

@@ -24,6 +24,10 @@ Command line options:
 - `--port`: Set the port to listen. Default: `8080`.
 - `--path`: path from which to serve static files (default examples/server/public)
 - `--embedding`: Enable embedding extraction, Default: disabled.
+- `-np N`, `--parallel N`: Set the number of slots for processing requests (default: 1)
+- `-cb`, `--cont-batching`: enable continuous batching (a.k.a. dynamic batching) (default: disabled)
+- `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.

 ## Build

@@ -158,6 +162,8 @@ node index.js

 `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

+`image_data`: An array of objects holding base64-encoded image `data` and an `id` used to reference the image in `prompt`. You can place the image in the prompt like this: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id 12 in the accompanying `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+
 *Result JSON:*

 Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
@@ -188,6 +194,12 @@ node index.js
 `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)

+`slot_id`: Assign the completion task to a specific slot. If set to -1, the task will be assigned to an idle slot (default: -1)
+
+`cache_prompt`: Save the prompt and generation so the entire prompt is not reprocessed when only part of it changes (default: false)
+
+`system_prompt`: Change the system prompt (initial prompt of all slots); this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+
 - **POST** `/tokenize`: Tokenize a given text.

 *Options:*
@@ -218,8 +230,32 @@ node index.js

 It also accepts all the options of `/completion` except `stream` and `prompt`.

+- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+
 ## More examples

+### Change system prompt on runtime
+
+To use the server example to serve multiple chat-type clients while keeping the same system prompt, use the `system_prompt` option. This only needs to be done once to establish it.
+
+`prompt`: Specify a context that you want all connecting clients to respect.
+
+`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
+
+`assistant_name`: The bot's name, which each client needs in order to generate the prompt. This must be sent to each client through the `/props` endpoint.
+
+```json
+{
+    "system_prompt": {
+        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
+        "anti_prompt": "User:",
+        "assistant_name": "Assistant:"
+    }
+}
+```
+
+**NOTE**: You can do this automatically when starting the server by creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
+
 ### Interactive mode

 Check the sample in [chat.mjs](chat.mjs).
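
A minimal Python sketch, for illustration, of the README additions above: setting the system prompt once, reading `/props`, and sending a multimodal `/completion` request with `image_data`, `cache_prompt`, and `slot_id`. It assumes a server at `http://127.0.0.1:8080` started with a LLaVA model and `--mmproj`, that a `system_prompt`-only request is accepted as the README text suggests, and a placeholder image file `some_image.jpg`.

```python
# Sketch of the new server fields documented above (paths and URL are assumptions).
import base64
import json
import urllib.request

API_URL = "http://127.0.0.1:8080"

def post_json(path, payload):
    # POST a JSON body to the server and return the parsed JSON response.
    req = urllib.request.Request(
        API_URL + path,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

# 1) Set the system prompt once for all slots (see "Change system prompt on runtime").
#    Assumes a system_prompt-only request is accepted, per the README text above.
post_json("/completion", {
    "system_prompt": {
        "prompt": "Transcript of a dialog between a User and a helpful Assistant.\nUser:",
        "anti_prompt": "User:",
        "assistant_name": "Assistant:",
    },
})

# 2) Fetch the assistant name and anti-prompt that clients should use.
with urllib.request.urlopen(API_URL + "/props") as resp:
    print(json.loads(resp.read()))

# 3) Multimodal request: [img-12] in the prompt is replaced by the embeddings of
#    the image with id 12 from image_data (multimodal models only, e.g. LLaVA).
with open("some_image.jpg", "rb") as f:  # placeholder image path
    img_b64 = base64.b64encode(f.read()).decode("ascii")

result = post_json("/completion", {
    "prompt": "USER:[img-12]Describe the image in detail.\nASSISTANT:",
    "image_data": [{"data": img_b64, "id": 12}],
    "n_predict": 128,
    "cache_prompt": True,  # reuse the evaluated prompt on follow-up requests
    "slot_id": -1,         # -1 lets the server pick an idle slot
})
print(result["content"])
```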

examples/server/api_like_OAI.py (+4 −1)

@@ -8,6 +8,7 @@


 app = Flask(__name__)
+slot_id = -1

 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
 parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
     if(is_present(body, "stop")): postData["stop"] += body["stop"]
     postData["n_keep"] = -1
     postData["stream"] = stream
-
+    postData["cache_prompt"] = True
+    postData["slot_id"] = slot_id
     return postData

 def make_resData(data, chat=False, promptToken=[]):
@@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
             }
         ]
     }
+    slot_id = data["slot_id"]
     if (chat):
         if (start):
             resData["choices"][0]["delta"] = {

examples/server/chat.mjs (+11)

@@ -7,6 +7,11 @@ const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
     (_, index) => args[index - 1] === "--grammar-json-schema"
 );
+
+const no_cached_prompt = args.find(
+    (_, index) => args[index - 1] === "--no-cache-prompt"
+) ?? "false";
+
 const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");

 // Example usage: function,arguments
@@ -30,6 +35,9 @@ if (grammarFile) {
     grammar = readFileSync(grammarFile, 'utf-8')
 }

+// for cached prompt
+let slot_id = -1;
+
 const API_URL = 'http://127.0.0.1:8080'

 const chat = [
@@ -76,6 +84,8 @@ async function chat_completion(question) {
         top_p: 0.9,
         n_keep: n_keep,
         n_predict: 256,
+        cache_prompt: no_cached_prompt === "false",
+        slot_id: slot_id,
         stop: ["\n### Human:"], // stop completion after generating this
         grammar,
         stream: true,
@@ -92,6 +102,7 @@ async function chat_completion(question) {
         const t = Buffer.from(chunk).toString('utf8')
         if (t.startsWith('data: ')) {
             const message = JSON.parse(t.substring(6))
+            slot_id = message.slot_id
             answer += message.content
             process.stdout.write(message.content)
             if (message.stop) {
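
Both `api_like_OAI.py` and `chat.mjs` above follow the same slot-reuse pattern: send `slot_id: -1` with `cache_prompt` enabled, remember the `slot_id` the server returns, and pass it back on the next turn so that slot's prompt cache is reused. A minimal non-streaming Python sketch of that round trip; the server URL and prompts are assumptions, and it reads `slot_id` defensively in case the non-streaming response shapes it differently from the streamed chunks read in `chat.mjs`.

```python
# Sketch of the slot-reuse / prompt-caching round trip used by chat.mjs and
# api_like_OAI.py above (URL and prompts are placeholders).
import json
import urllib.request

API_URL = "http://127.0.0.1:8080"
slot_id = -1  # -1 = let the server assign an idle slot

def completion(prompt):
    global slot_id
    body = {
        "prompt": prompt,
        "n_predict": 64,
        "cache_prompt": True,  # keep the evaluated prompt in the slot's cache
        "slot_id": slot_id,    # stick to the same slot once we know it
    }
    req = urllib.request.Request(
        API_URL + "/completion",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        data = json.loads(resp.read())
    slot_id = data.get("slot_id", slot_id)  # remember the slot for the next turn
    return data["content"]

history = "User: Hello!\nAssistant:"
history += completion(history)
# The second turn extends the same prompt, so with cache_prompt the common
# prefix does not have to be re-evaluated by the assigned slot.
history += "\nUser: Tell me more.\nAssistant:"
history += completion(history)
print(history)
```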
