From f3404ec1963dcd82ab87a803a78f21eb0c1873d0 Mon Sep 17 00:00:00 2001 From: Chenyaaang Date: Thu, 24 Apr 2025 22:50:30 +0000 Subject: [PATCH] enforce user to input max_num_batched_tokens, max_num_seqs, max_model_len to reduce chance of perf degradation Signed-off-by: Chenyaaang --- examples/online_serving/chart-helm/values.yaml | 2 +- vllm/entrypoints/openai/cli_args.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml index 9c48e7d061b..ddb4260a05d 100644 --- a/examples/online_serving/chart-helm/values.yaml +++ b/examples/online_serving/chart-helm/values.yaml @@ -8,7 +8,7 @@ image: # -- Image tag tag: "latest" # -- Container launch command - command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"] + command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000", "--max-num-batched-tokens", "2048", "--max-num-seqs", "16", "--max-model-len", "2048"] # -- Container port containerPort: 8000 diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index af546c3032a..b615ff543b3 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -289,6 +289,19 @@ def validate_parsed_serve_args(args: argparse.Namespace): raise TypeError("Error: --enable-reasoning requires " "--reasoning-parser") + # Ensure that --max-num-batched-tokens, --max-num-seqs, --max-model-len + # are passed within the command on TPU.
+ from vllm.platforms import current_platform + if current_platform.is_tpu(): + if args.max_num_batched_tokens is None: + raise ValueError("Requires --max-num-batched-tokens") + + if args.max_num_seqs is None: + raise ValueError("Requires --max-num-seqs") + + if args.max_model_len is None: + raise ValueError("Requires --max-model-len") + def create_parser_for_docs() -> FlexibleArgumentParser: parser_for_docs = FlexibleArgumentParser(