Commit b027c65
[Perf] Optimize rotary_emb implementation to use Triton operator for improved inference performance

Signed-off-by: cynthieye <yexin93@qq.com>
Co-authored-by: MagnetoWang <magnetowang@outlook.com>

Merge commit b027c65 (2 parents: 19637a7 + 7a0a9da)
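
The rotary_emb change named in the commit title lives in kernel code that does not appear among the hunks shown below; the visible hunks are CI, benchmark, and build updates. For orientation, a minimal PyTorch reference of the rotary position embedding that a fused Triton operator replaces could look like the following sketch (function and tensor names are illustrative, not taken from this commit):

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half and rotate the pairs: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_emb(q: torch.Tensor, k: torch.Tensor,
                     cos: torch.Tensor, sin: torch.Tensor):
    # Rotate query/key head dimensions by the per-position angles in cos/sin.
    # cos and sin must broadcast against q and k (e.g. shape [seq_len, 1, head_dim]).
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin

A Triton kernel can fuse the split, multiplies, and adds into a single GPU launch and skip the intermediate tensors the eager version allocates, which is the kind of fusion the commit title refers to.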

File tree: 337 files changed, +11222 and -5161 lines. Large commits have some content hidden by default, so only a subset of the changed files is shown below.
.buildkite/lm-eval-harness/configs/Qwen1.5-MoE-W4A16-compressed-tensors.yaml (+2, -2)

@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.31
+    value: 0.30
   - name: "exact_match,flexible-extract"
-    value: 0.47
+    value: 0.465
 limit: 1319
 num_fewshot: 5

.buildkite/lm-eval-harness/test_lm_eval_correctness.py (+1, -1)

@@ -16,7 +16,7 @@
 import pytest
 import yaml

-RTOL = 0.05
+RTOL = 0.08
 TEST_DATA_FILE = os.environ.get(
     "LM_EVAL_TEST_DATA_FILE",
     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

.buildkite/scripts/hardware_ci/run-amd-test.sh (+7)

@@ -98,6 +98,13 @@ if [[ $commands == *" kernels "* ]]; then
   --ignore=kernels/test_machete_mm.py \
   --ignore=kernels/test_mha_attn.py \
   --ignore=kernels/test_block_fp8.py \
+  --ignore=kernels/test_cutlass_moe.py \
+  --ignore=kernels/test_mamba_ssm_ssd.py \
+  --ignore=kernels/test_attention.py \
+  --ignore=kernels/test_block_int8.py \
+  --ignore=kernels/test_fused_quant_layernorm.py \
+  --ignore=kernels/test_int8_kernel.py \
+  --ignore=kernels/test_triton_moe_ptpc_fp8.py \
   --ignore=kernels/test_permute_cols.py"
 fi

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh (+11, -4)

@@ -5,25 +5,30 @@
 set -ex

 # Setup cleanup
-remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
+remove_docker_container() {
+  if [[ -n "$container_id" ]]; then
+    podman rm -f "$container_id" || true
+  fi
+  podman system prune -f
+}
 trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
 podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .

 # Run the image
-podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
+container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)

 function cpu_tests() {

   # offline inference
-  podman exec cpu-test-ubi9-ppc bash -c "
+  podman exec -it "$container_id" bash -c "
     set -e
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

   # Run basic model test
-  podman exec cpu-test-ubi9-ppc bash -c "
+  podman exec -it "$container_id" bash -c "
     set -e
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
     pip install sentence-transformers datamodel_code_generator

@@ -33,6 +38,8 @@ function cpu_tests() {
 }

 # All of CPU tests are expected to be finished less than 40 mins.
+
+export container_id
 export -f cpu_tests
 timeout 40m bash -c cpu_tests

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh (+7, -2)

@@ -17,8 +17,9 @@ source /etc/environment
 docker run --privileged --net host --shm-size=16G -it \
   -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
   vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-  && python3 -m pip install pytest tpu-info \
+  && python3 -m pip install pytest pytest-asyncio tpu-info \
   && python3 -m pip install lm_eval[api]==0.4.4 \
+  && export VLLM_XLA_CACHE_PATH= \
   && export VLLM_USE_V1=1 \
   && export VLLM_XLA_CHECK_RECOMPILATION=1 \
   && echo HARDWARE \

@@ -42,7 +43,11 @@ docker run --privileged --net host --shm-size=16G -it \
   && echo TEST_8 \
   && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
   && echo TEST_9 \
-  && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
+  && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
+  && echo TEST_10 \
+  && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
+  && echo TEST_11 \
+  && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \


 # TODO: This test fails because it uses RANDOM_SEED sampling

.buildkite/test-pipeline.yaml (+41, -5)

@@ -8,6 +8,7 @@
 # Documentation
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
 # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.

@@ -70,6 +71,7 @@ steps:
 - label: Basic Correctness Test # 30min
   #mirror_hardwares: [amd]
   fast_check: true
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_basic_correctness

@@ -104,6 +106,7 @@ steps:
 - label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
+  torch_nightly: true
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/

@@ -205,6 +208,8 @@ steps:
   - pytest -v -s v1/sample
   - pytest -v -s v1/worker
   - pytest -v -s v1/structured_output
+  - pytest -v -s v1/spec_decode
+  - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s v1/test_stats.py
   - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py

@@ -312,15 +317,46 @@ steps:
   commands:
   - pytest -v -s compile/test_full_graph.py

-- label: Kernels Test %N # 1h each
-  # mirror_hardwares: [amd]
+- label: Kernels Core Operation Test
   source_file_dependencies:
   - csrc/
+  - tests/kernels/core
+  commands:
+  - pytest -v -s kernels/core
+
+- label: Kernels Attention Test %N
+  source_file_dependencies:
+  - csrc/attention/
   - vllm/attention
-  - tests/kernels
+  - vllm/v1/attention
+  - tests/kernels/attention
   commands:
-  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Quantization Test %N
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test
+  source_file_dependencies:
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  commands:
+  - pytest -v -s kernels/moe
+
+- label: Kernels Mamba Test
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  commands:
+  - pytest -v -s kernels/mamba

 - label: Tensorizer Test # 11min
   # mirror_hardwares: [amd]

.github/mergify.yml (+32, -2)

@@ -55,11 +55,19 @@ pull_request_rules:
   description: Automatically apply structured-output label
   conditions:
     - or:
+      - files~=^benchmarks/structured_schemas/
+      - files=benchmarks/benchmark_serving_structured_output.py
+      - files=benchmarks/run_structured_output_benchmark.sh
+      - files=docs/source/features/structured_outputs.md
+      - files=examples/offline_inference/structured_outputs.py
+      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
+      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
       - files~=^vllm/model_executor/guided_decoding/
       - files=tests/model_executor/test_guided_processors.py
       - files=tests/entrypoints/llm/test_guided_generate.py
-      - files=benchmarks/benchmark_serving_guided.py
-      - files=benchmarks/benchmark_guided.py
+      - files~=^tests/v1/structured_output/
+      - files=tests/v1/entrypoints/llm/test_guided_generate.py
+      - files~=^vllm/v1/structured_output/
   actions:
     label:
       add:

@@ -118,6 +126,28 @@
     remove:
       - tpu

+- name: label-tool-calling
+  description: Automatically add tool-calling label
+  conditions:
+    - or:
+      - files~=^tests/tool_use/
+      - files~=^tests/mistral_tool_use/
+      - files~=^tests/entrypoints/openai/tool_parsers/
+      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+      - files~=^vllm/entrypoints/openai/tool_parsers/
+      - files=docs/source/features/tool_calling.md
+      - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
+      - files=docs/source/getting_started/examples/chat_with_tools.md
+      - files~=^examples/tool_chat_*
+      - files=examples/offline_inference/chat_with_tools.py
+      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
+      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
+  actions:
+    label:
+      add:
+        - tool-calling
+
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
     - conflict

CMakeLists.txt (+1)

@@ -678,6 +678,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   set(VLLM_ROCM_EXT_SRC
     "csrc/rocm/torch_bindings.cpp"
+    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")

  define_gpu_extension_target(

benchmarks/benchmark_serving_structured_output.py (+7, -7)

@@ -51,7 +51,7 @@
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser

-from vllm.v1.structured_output.utils import (
+from vllm.v1.structured_output.backend_xgrammar import (
     has_xgrammar_unsupported_json_features)

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000

@@ -150,17 +150,17 @@ def get_schema(index: int):

     elif args.dataset == "grammar":
         schema = """
-            ?start: select_statement
+            root ::= select_statement

-            ?select_statement: "SELECT " column_list " FROM " table_name
+            select_statement ::= "SELECT " column " from " table " where " condition

-            ?column_list: column_name ("," column_name)*
+            column ::= "col_1 " | "col_2 "

-            ?table_name: identifier
+            table ::= "table_1 " | "table_2 "

-            ?column_name: identifier
+            condition ::= column "= " number

-            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
+            number ::= "1 " | "2 "
         """
         prompt = "Generate an SQL query to show the 'username' \
             and 'email' from the 'users' table."
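
Two things change in this file: the helper import moves from vllm.v1.structured_output.utils to vllm.v1.structured_output.backend_xgrammar, and the embedded example grammar is rewritten from Lark-style rules (?start: ...) to the GBNF-style rules (root ::= ...) that the xgrammar backend consumes. A minimal sketch of using such a GBNF grammar for guided generation, assuming vLLM's offline GuidedDecodingParams API (the model name is a placeholder, not taken from this commit):

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# GBNF-style grammar with the same shape as the one the benchmark now embeds.
sql_grammar = """
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")
params = SamplingParams(
    max_tokens=64,
    guided_decoding=GuidedDecodingParams(grammar=sql_grammar),
)
outputs = llm.generate(
    "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
    params)
print(outputs[0].outputs[0].text)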

benchmarks/benchmark_throughput.py (+7)

@@ -523,6 +523,13 @@ def validate_args(args):
         raise ValueError(
             "Tokenizer must be the same as the model for MII backend.")

+    # --data-parallel is not supported currently.
+    # https://github.com/vllm-project/vllm/issues/16222
+    if args.data_parallel_size > 1:
+        raise ValueError(
+            "Data parallel is not supported in offline benchmark, \
+            please use benchmark serving instead")
+

 if __name__ == "__main__":
     parser = FlexibleArgumentParser(description="Benchmark the throughput.")
