remove cache_transceiver_prealloc_size #4153

Merged: 1 commit, merged on May 12, 2025
34 changes: 2 additions & 32 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -1,5 +1,4 @@
 import math
-import os
 import random
 from collections.abc import Iterable
 
@@ -18,8 +17,7 @@
 from ..speculative import get_num_spec_layers, get_spec_decoder
 from .decoder import (EarlyStopDecoder, TorchDecoder, TorchStarAttentionDecoder,
                       TRTLLMDecoder)
-from .kv_cache_transceiver import (AttentionTypeCpp, CacheTransBufferManager,
-                                   create_kv_cache_transceiver)
+from .kv_cache_transceiver import AttentionTypeCpp, create_kv_cache_transceiver
 from .model_engine import KV_CACHE_MANAGER_KEY, PyTorchModelEngine
 from .py_executor import PyExecutor
 from .resource_manager import (KVCacheManager, MambaHybridCacheManager,
Expand Down Expand Up @@ -151,29 +149,6 @@ def get_token_num_for_estimation(executor_config, model_config):
return None


def get_cache_transceiver_prealloc_size(executor_config: ExecutorConfig,
model_config: PyTorchModelEngine,
mapping: Mapping):
if (os.getenv("TRTLLM_USE_MPI_KVCACHE")
or os.getenv("TRTLLM_USE_UCX_KVCACHE")):
kv_size_per_token = int(get_cache_size_per_token(model_config, mapping))
logger.info(
f"get_cache_transceiver_prealloc_size kv_size_per_token: {kv_size_per_token} , executor_config.cache_transceiver_config: {executor_config.cache_transceiver_config}"
)
if executor_config.cache_transceiver_config is not None:
logger.info(
f"get_cache_transceiver_prealloc_size executor_config.cache_transceiver_config.max_num_tokens: {executor_config.cache_transceiver_config.max_num_tokens}"
)
return CacheTransBufferManager.pre_alloc_buffer_size(
executor_config.cache_transceiver_config.max_num_tokens,
kv_size_per_token)
else:
return CacheTransBufferManager.pre_alloc_buffer_size(
None, kv_size_per_token)
else:
return 0


def estimate_max_kv_cache_tokens(py_executor: PyExecutor,
model_engine: PyTorchModelEngine,
executor_config: ExecutorConfig,
@@ -221,12 +196,7 @@ def estimate_max_kv_cache_tokens(py_executor: PyExecutor,
     total_used_bytes = total_gpu_memory - end
     activation_bytes = torch_peak_memory - model_bytes
     extra_cost = max(total_used_bytes - torch_used_bytes, 0)
-    kv_cache_transceiver_prealloc_size = get_cache_transceiver_prealloc_size(
-        executor_config, model_engine.model.model_config, mapping)
-    logger.info(
-        f"kv_cache_transceiver_prealloc_size: {kv_cache_transceiver_prealloc_size}"
-    )
-    peak_memory = torch_peak_memory + extra_cost + kv_cache_transceiver_prealloc_size
+    peak_memory = torch_peak_memory + extra_cost
     logger.info(
         f"Memory dynamically allocated during inference (inside torch) in memory usage profiling: {activation_bytes / (GB):.2f} GiB"
    )
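With the pre-allocation removed, the estimate in `estimate_max_kv_cache_tokens` no longer reserves a cache-transceiver buffer: the estimated peak is just the torch peak plus whatever was observed allocated outside torch. Below is a minimal sketch of that arithmetic after this PR; the variable names mirror the ones in the diff, but the byte values are illustrative placeholders, not measurements taken from the code.

```python
# Sketch of the simplified peak-memory estimate, assuming the quantities
# below (in bytes) were already measured the way estimate_max_kv_cache_tokens
# measures them during its profiling forward pass. All numbers are made up.
GB = 1 << 30

total_gpu_memory = 80 * GB   # total device memory
end = 50 * GB                # free device memory observed after profiling
torch_peak_memory = 24 * GB  # peak allocation reported by torch
torch_used_bytes = 20 * GB   # memory torch reports as currently allocated
model_bytes = 14 * GB        # model weights

total_used_bytes = total_gpu_memory - end
activation_bytes = torch_peak_memory - model_bytes
extra_cost = max(total_used_bytes - torch_used_bytes, 0)

# After this PR, kv_cache_transceiver_prealloc_size is no longer added here.
peak_memory = torch_peak_memory + extra_cost
print(f"activation: {activation_bytes / GB:.2f} GiB, peak: {peak_memory / GB:.2f} GiB")
```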