// k6 load test for a llama.cpp HTTP server's OpenAI-compatible /v1/chat/completions
// endpoint, driven by prompts from the ShareGPT dataset.
// Run with: k6 run <path-to-this-script>
import http from 'k6/http';
import { check, sleep } from 'k6';
import { SharedArray } from 'k6/data';
import { Counter, Gauge, Rate } from 'k6/metrics';

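// Load the ShareGPT conversations once and share them read-only across all VUs.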
const data = new SharedArray('conversations', function () {
    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))
        // Filter out conversations with fewer than 2 turns.
        .filter(data => data["conversations"].length >= 2)
        // Only keep the first two turns of each conversation.
        .map(data => [data["conversations"][0]["value"], data["conversations"][1]["value"]]);
});

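// Custom metrics reported in the k6 end-of-test summary.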
const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');

const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');

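// Abort early when too many completions are truncated; ramp VUs 1 -> 8, hold, then back to 0.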
export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // Abort the test if more than 10% of completions are truncated.
            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
        ],
    },
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                { duration: '1m', target: 8 },
                { duration: '3m', target: 8 },
                { duration: '1m', target: 0 },
            ],
            gracefulRampDown: '30s',
        },
    },
};

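// Each iteration sends one chat completion request and records token and throughput metrics.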
export default function () {
    // Build an OpenAI-compatible chat completion request from the first conversation.
    const conversation = data[0];
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": conversation[0],
            },
            {
                "role": "user",
                "content": conversation[1],
            }
        ],
        "model": "model",
        "stream": false,
    };
    const res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
        headers: { 'Content-Type': 'application/json' },
    });

    check(res, { 'success completion': (r) => r.status === 200 });

    const completions = res.json();

    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens);
    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens);

    llamacpp_completion_tokens.add(completions.usage.completion_tokens);
    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens);

    // res.timings.duration is in milliseconds, so scale by 1e3 to get completion tokens per second.
    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3);

    // finish_reason 'length' means the completion was truncated by the token limit;
    // 'stop' means the model stopped on its own.
    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length');
    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop');

    sleep(0.3);
}