diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 572e052cdcc..964068ae80e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -249,6 +249,7 @@ def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]: return engine_core_outputs def shutdown(self): + self.structured_output_manager.clear_backend() if self.model_executor: self.model_executor.shutdown() diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 218af43deb6..0fd66c07296 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -107,3 +107,7 @@ def grammar_bitmask( # np.ndarray, because that is much more efficient for serialization # and deserialization when sending this to the GPU workers. return bitmask_tensor.numpy() + + def clear_backend(self) -> None: + if self.backend is not None: + self.backend.destroy() diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 0edb15558dc..0d79960d82e 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -110,6 +110,9 @@ def allocate_token_bitmask(self, max_num_seqs: int): return llguidance_torch.allocate_token_bitmask( max_num_seqs, self.ll_tokenizer.vocab_size) + def destroy(self): + pass + @dataclass class GuidanceGrammar(StructuredOutputGrammar): diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index 6dc2a92411d..306e4aa0196 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -87,3 +87,9 @@ def allocate_token_bitmask(self, max_num_seqs: int): max_num_seqs (int): The maximum number of sequences for which to allocate the bitmask. """ + + @abstractmethod + def destroy(self): + """ + Backend-specific cleanup; release any resources this backend holds. Invoked via StructuredOutputManager.clear_backend() when the engine core shuts down.
+ """ diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 1e4470153e3..1d7d9efc0b3 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -126,6 +126,9 @@ def compile_grammar(self, request_type: StructuredOutputOptions, def allocate_token_bitmask(self, max_num_seqs: int): return xgr.allocate_token_bitmask(max_num_seqs, self.vocab_size) + def destroy(self): + del self.compiler + @dataclass class XgrammarGrammar(StructuredOutputGrammar):