From f3404ec1963dcd82ab87a803a78f21eb0c1873d0 Mon Sep 17 00:00:00 2001 From: Chenyaaang Date: Thu, 24 Apr 2025 22:50:30 +0000 Subject: [PATCH] enforce user to input max_num_batched_tokens, max_num_seqs, max_model_len to reduce chance of perf degradation Signed-off-by: Chenyaaang --- examples/online_serving/chart-helm/values.yaml | 2 +- vllm/entrypoints/openai/cli_args.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml index 9c48e7d061b..ddb4260a05d 100644 --- a/examples/online_serving/chart-helm/values.yaml +++ b/examples/online_serving/chart-helm/values.yaml @@ -8,7 +8,7 @@ image: # -- Image tag tag: "latest" # -- Container launch command - command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"] + command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000", "--max-num-batched-tokens", "2048", "--max-num-seqs", "16", "--max-model-len", "2048"] # -- Container port containerPort: 8000 diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index af546c3032a..b615ff543b3 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -289,6 +289,19 @@ def validate_parsed_serve_args(args: argparse.Namespace): raise TypeError("Error: --enable-reasoning requires " "--reasoning-parser") + # Ensure that --max-num-batched-tokens, --max-num-seqs, --max-model-len + # are passed within the command on TPU.
+ from vllm.platforms import current_platform + if current_platform.is_tpu(): + if args.max_num_batched_tokens is None: + raise ValueError("Requires --max-num-batched-tokens") + + if args.max_num_seqs is None: + raise ValueError("Requires --max-num-seqs") + + if args.max_model_len is None: + raise ValueError("Requires --max-model-len") + def create_parser_for_docs() -> FlexibleArgumentParser: parser_for_docs = FlexibleArgumentParser(