
Commit d803786

[V1][Bugfix]: vllm v1 version metric num_gpu_blocks is None (#15755)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
1 parent 1534d38 commit d803786

File tree

4 files changed: +37 -11 lines

    vllm/v1/engine/async_llm.py     (+2 -1)
    vllm/v1/engine/core.py          (+8 -1)
    vllm/v1/engine/core_client.py   (+10 -3)
    vllm/v1/metrics/loggers.py      (+17 -6)

vllm/v1/engine/async_llm.py  (+2 -1)

@@ -120,7 +120,8 @@ def __init__(
             executor_class=executor_class,
             log_stats=self.log_stats,
         )
-
+        for stat_logger in self.stat_loggers[0]:
+            stat_logger.log_engine_initialized()
         self.output_handler: Optional[asyncio.Task] = None
         try:
             # Start output handler eagerly if we are in the asyncio eventloop.

vllm/v1/engine/core.py  (+8 -1)

@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+import json
 import os
 import queue
 import signal
@@ -116,6 +117,7 @@ def __init__(self,
             logger.info("Batch queue is enabled with size %d",
                         self.batch_queue_size)
             self.batch_queue = queue.Queue(self.batch_queue_size)
+        self.vllm_config = vllm_config

     def _initialize_kv_caches(
             self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
@@ -507,7 +509,12 @@ def process_input_socket(self, input_path: str, engine_index: int):
                            bind=False) as socket:

             # Send ready message to front-end once input socket is connected.
-            socket.send(b'READY')
+            message_dict = {
+                'type': 'READY',
+                'num_gpu_blocks': self.vllm_config.cache_config.num_gpu_blocks,
+            }
+            message = json.dumps(message_dict).encode('utf-8')
+            socket.send(message)

             while True:
                 # (RequestType, RequestData)
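
The engine core now serializes a small JSON dict instead of the bare b'READY' bytes, so the front-end process can learn the num_gpu_blocks value that was computed during KV cache initialization in the engine-core process. Here is a self-contained sketch of building that payload; CacheConfigSketch and build_ready_message are hypothetical stand-ins (only the num_gpu_blocks field is modeled), and 4096 is an example value.

    # Sketch of the engine-core side of the handshake: build the READY payload
    # from the cache config and encode it for the ZMQ socket.
    import json
    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class CacheConfigSketch:
        # Stand-in for the cache config; only the field used here.
        num_gpu_blocks: Optional[int] = None


    def build_ready_message(cache_config: CacheConfigSketch) -> bytes:
        message_dict = {
            "type": "READY",
            "num_gpu_blocks": cache_config.num_gpu_blocks,
        }
        return json.dumps(message_dict).encode("utf-8")


    # By the time the READY message is sent, KV cache initialization has run
    # in the engine-core process, so the payload carries the real value.
    print(build_ready_message(CacheConfigSketch(num_gpu_blocks=4096)))
    # b'{"type": "READY", "num_gpu_blocks": 4096}'
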

vllm/v1/engine/core_client.py  (+10 -3)

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
 import contextlib
+import json
 import queue
 import uuid
 import weakref
@@ -362,6 +363,7 @@ def __init__(
         executor_class: type[Executor],
         log_stats: bool,
     ):
+        self.vllm_config = vllm_config
         # Serialization setup.
         self.encoder = MsgpackEncoder()
         self.decoder = MsgpackDecoder(EngineCoreOutputs)
@@ -430,14 +432,19 @@ def _wait_for_engine_startup(self):
                 raise RuntimeError("Engine core initialization failed. "
                                    "See root cause above.")

-            eng_id_bytes, msg = sync_input_socket.recv_multipart()
+            eng_id_bytes, data = sync_input_socket.recv_multipart()
             eng_id = int.from_bytes(eng_id_bytes, byteorder="little")
             if eng_id not in identities:
                 raise RuntimeError(f"Unexpected or duplicate engine: {eng_id}")
-            if msg != b'READY':
-                raise RuntimeError(f"Engine {eng_id} failed: {msg.decode()}")
+            message_dict = json.loads(data.decode('utf-8'))
+            if message_dict['type'] != 'READY':
+                raise RuntimeError(f"Engine {eng_id} failed: {data.decode()}")
             logger.info("Core engine process %d ready.", eng_id)
             identities.discard(eng_id)
+            # Setup KV cache config with initialization state from
+            # engine core process.
+            self.vllm_config.cache_config.num_gpu_blocks = message_dict[
+                'num_gpu_blocks']

     def _init_core_engines(
             self,
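
On the client side, the READY payload is decoded and num_gpu_blocks is copied into the front-end's copy of the config, which is what ultimately makes the metric non-None. A sketch of that logic follows; handle_ready, CacheConfigSketch, and VllmConfigSketch are hypothetical stand-ins for the real method and config classes.

    # Client-side sketch mirroring the updated _wait_for_engine_startup logic.
    import json
    from dataclasses import dataclass, field
    from typing import Optional


    @dataclass
    class CacheConfigSketch:
        num_gpu_blocks: Optional[int] = None


    @dataclass
    class VllmConfigSketch:
        cache_config: CacheConfigSketch = field(default_factory=CacheConfigSketch)


    def handle_ready(data: bytes, eng_id: int, vllm_config: VllmConfigSketch) -> None:
        message_dict = json.loads(data.decode("utf-8"))
        if message_dict["type"] != "READY":
            raise RuntimeError(f"Engine {eng_id} failed: {data.decode()}")
        # Before this fix, num_gpu_blocks was only set in the engine-core
        # process, so the front-end's cache config stayed None.
        vllm_config.cache_config.num_gpu_blocks = message_dict["num_gpu_blocks"]


    config = VllmConfigSketch()
    handle_ready(b'{"type": "READY", "num_gpu_blocks": 4096}',
                 eng_id=0, vllm_config=config)
    assert config.cache_config.num_gpu_blocks == 4096
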

vllm/v1/metrics/loggers.py  (+17 -6)

@@ -39,6 +39,10 @@ def record(self, scheduler_stats: SchedulerStats,
                iteration_stats: Optional[IterationStats]):
         ...

+    @abstractmethod
+    def log_engine_initialized(self):
+        ...
+
     def log(self):  # noqa
         pass

@@ -47,6 +51,7 @@ class LoggingStatLogger(StatLoggerBase):

     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         self.engine_index = engine_index
+        self.vllm_config = vllm_config
         self._reset(time.monotonic())
         self.last_scheduler_stats = SchedulerStats()
         # Prefix cache metrics. This cannot be reset.
@@ -127,12 +132,19 @@ def log(self):
         if scheduler_stats.spec_decoding_stats is not None:
             self.spec_decoding_logging.log(log_fn=log_fn)

+    def log_engine_initialized(self):
+        logger.info(
+            "vllm cache_config_info with initialization " \
+            "after num_gpu_blocks is: %d",
+            self.vllm_config.cache_config.num_gpu_blocks)
+

 class PrometheusStatLogger(StatLoggerBase):

     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         self._unregister_vllm_metrics()
-
+        self.vllm_config = vllm_config
+        self.engine_index = engine_index
         # Use this flag to hide metrics that were deprecated in
         # a previous release and which will be removed future
         self.show_hidden_metrics = \
@@ -342,13 +354,9 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
                 self.labelname_running_lora_adapters,
             ])

-        #
-        # Cache config info metric
-        #
-        self.log_metrics_info("cache_config", vllm_config.cache_config)
-
     def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
         metrics_info = config_obj.metrics_info()
+        metrics_info["engine"] = self.engine_index

         name, documentation = None, None
         if type == "cache_config":
@@ -442,6 +450,9 @@ def _unregister_vllm_metrics():
             if hasattr(collector, "_name") and "vllm" in collector._name:
                 prometheus_client.REGISTRY.unregister(collector)

+    def log_engine_initialized(self):
+        self.log_metrics_info("cache_config", self.vllm_config.cache_config)
+

 def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
     """
