@@ -1165,3 +1165,80 @@ def test_kv_connector_handles_preemption():
1165
1165
# All memory should be freed since nothing is running.
1166
1166
assert scheduler .kv_cache_manager .block_pool .get_num_free_blocks () \
1167
1167
== NUM_BLOCKS - 1
1168
+
1169
+
1170
def make_output(scheduler: Scheduler):
    """Fabricate a ModelRunnerOutput for the scheduler's current running set.

    Every running request gets exactly one sampled token (id 1000); no
    speculative tokens or logprobs are produced.
    """
    running = scheduler.running
    request_ids = [request.request_id for request in running]
    # Index mapping mirrors the order of the running list.
    index_of = {rid: pos for pos, rid in enumerate(request_ids)}
    return ModelRunnerOutput(
        req_ids=request_ids,
        req_id_to_index=index_of,
        # NOTE: list repetition intentionally mirrors the original
        # construction; consumers only read these token lists.
        sampled_token_ids=[[1000]] * len(running),
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
    )
1182
+
1183
+
1184
def assert_scheduler_empty(scheduler: Scheduler):
    """Confirm the scheduler is "empty" - i.e. no leaks."""
    # Scheduler-level bookkeeping must hold nothing.
    assert not scheduler.requests
    assert not scheduler.waiting
    assert not scheduler.running
    assert not scheduler.finished_req_ids
    assert not scheduler._cached_reqs_data

    # EncoderCacheManager must hold nothing.
    encoder_mgr = scheduler.encoder_cache_manager
    assert not encoder_mgr.freed
    assert not encoder_mgr.cached

    # KVCacheManager: no per-request block state remains.
    kv_mgr = scheduler.kv_cache_manager
    assert not kv_mgr.req_to_blocks
    assert not kv_mgr.req_to_block_hashes
    assert not kv_mgr.num_cached_block

    # All blocks back in the free queue — presumably one block is held out
    # of the free queue permanently, hence the -1; confirm against BlockPool.
    pool = kv_mgr.block_pool
    assert pool.free_block_queue.num_free_blocks == pool.num_gpu_blocks - 1

    # NOTE(rob): just the ref count on blocks will be 0. The hash
    # value, etc will remain since we lazily evict for prefix cache,
    # so we deliberately do not check block hashes or the
    # cached_block_hash_to_block map here.
    for block in pool.blocks:
        assert block.ref_cnt == 0
1214
+
1215
+
1216
def test_memory_leak():
    """Test that we do not have a memory leak."""
    scheduler = create_scheduler(enable_prefix_caching=True)

    requests = create_requests(num_requests=5,
                               num_tokens=10,
                               max_tokens=10)

    # Feed requests in one at a time, stepping the scheduler after each add.
    for request in requests:
        scheduler.add_request(request)
        step_output = scheduler.schedule()
        scheduler.update_from_output(step_output, make_output(scheduler))

    # Drain: keep stepping until nothing is left running. The schedule()
    # call still happens on the final (empty) step before we break.
    while True:
        step_output = scheduler.schedule()
        if not scheduler.running:
            break
        scheduler.update_from_output(step_output, make_output(scheduler))

    # Confirm no memory leak.
    assert_scheduler_empty(scheduler)
0 commit comments