Commit 68d1d8f

server: bench: Init a bench scenario with K6
See #5827
1 parent 76e8688 commit 68d1d8f

2 files changed: +148 -0 lines changed

examples/server/bench/README.md

+64
@@ -0,0 +1,64 @@
### Server benchmark tools

The benchmark uses [k6](https://k6.io/).

#### Install k6 - Ubuntu

```shell
snap install k6
```

#### Downloading the ShareGPT dataset

```shell
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```

#### Download a model

Example for PHI-2:

```shell
../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
```

#### Start the server

The server must listen on `localhost:8080`.

Example:

```shell
server --host localhost --port 8080 \
    --model ggml-model-q4_0.gguf \
    --cont-batching \
    --metrics \
    --parallel 8 \
    --batch-size 512 \
    --ctx-size 4096 \
    --log-format text \
    -ngl 33
```

#### Run the bench

```shell
k6 run script.js
```

#### Change the number of concurrent users

In `script.js`, change the `stages` of the `ramping-vus` scenario (durations and `target` number of virtual users) according to your number of server slots; see the sketch below.
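
For example, for a server started with `--parallel 16` (16 slots), the scenario section of the options might be edited as follows. This is an illustrative sketch, not part of the committed script; the thresholds block of `script.js` is omitted here and the target values are placeholders to adapt to your setup:

```js
export const options = {
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                {duration: '1m', target: 16}, // ramp up to the number of server slots
                {duration: '3m', target: 16}, // hold that many concurrent users
                {duration: '1m', target: 0},  // ramp down
            ],
            gracefulRampDown: '30s',
        },
    },
};
```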

#### Metrics

The following metrics are available:
- `llamacpp_prompt_tokens` Gauge of the OAI response `usage.prompt_tokens`
- `llamacpp_prompt_tokens_total_counter` Counter of the OAI response `usage.prompt_tokens`
- `llamacpp_completion_tokens` Gauge of the OAI response `usage.completion_tokens`
- `llamacpp_completion_tokens_total_counter` Counter of the OAI response `usage.completion_tokens`
- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in seconds
- `llamacpp_completions_truncated_rate` Rate of truncated completions, i.e. `finish_reason === 'length'`
- `llamacpp_completions_stop_rate` Rate of completions that stopped naturally, i.e. `finish_reason === 'stop'`
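
For example, a request that returns `usage.completion_tokens = 96` and takes 3 seconds records a `llamacpp_completions_tokens_seconds` value of 96 / 3 = 32 tokens per second for that iteration.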
The script will fail if too many completions are truncated; see `llamacpp_completions_truncated_rate`.

K6 metrics can be compared against the [server metrics](../README.md) exposed at `/metrics` (enabled by the `--metrics` server flag) with:

```shell
curl http://localhost:8080/metrics
```

examples/server/bench/script.js

+84
@@ -0,0 +1,84 @@
```js
import http from 'k6/http';
import { check, sleep } from 'k6';
import { SharedArray } from 'k6/data';
import { Counter, Gauge, Rate } from 'k6/metrics';

const data = new SharedArray('conversations', function () {
    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))

        // Filter out the conversations with less than 2 turns.
        .filter(data => data["conversations"].length >= 2)
        // Only keep the first two turns of each conversation.
        .map(data => [data["conversations"][0]["value"], data["conversations"][1]["value"]]);
});

const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');

const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');

export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // abort the test if more than 10% of the completions are truncated
            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
        ],
    },
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                {duration: '1m', target: 8},
                {duration: '3m', target: 8},
                {duration: '1m', target: 0},
            ],
            gracefulRampDown: '30s',
        },
    },
};

export default function () {
    // NOTE: every iteration uses the first conversation of the dataset,
    // so all requests carry the same prompt
    const conversation = data[0]
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": conversation[0],
            },
            {
                "role": "user",
                "content": conversation[1],
            }
        ],
        "model": "model",
        "stream": false,
    }
    let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
        headers: { 'Content-Type': 'application/json' },
    })

    check(res, {'success completion': (r) => r.status === 200})

    const completions = res.json()

    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)

    llamacpp_completion_tokens.add(completions.usage.completion_tokens)
    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)

    // res.timings.duration is in milliseconds, hence the 1e3 factor to get tokens per second
    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)

    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')

    sleep(0.3)
}
```
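
As noted in the comment above, every iteration benchmarks the same first conversation (`data[0]`). A hypothetical one-line variation, not part of this commit, would sample a random conversation per iteration to vary prompt lengths:

```js
// Hypothetical alternative to `const conversation = data[0]`:
// pick a random conversation from the shared dataset on each iteration
const conversation = data[Math.floor(Math.random() * data.length)]
```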
