vllm-project · vllm-bot · Apr 24, 2025 · Apr 22, 2025 · Apr 24, 2025 · aarnphm
@@ -249,6 +249,7 @@ def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]:
         return engine_core_outputs
 
     def shutdown(self):
+        """Clear the structured-output backend, then shut down the executor."""
+        # NOTE(review): if clear_backend() raises, the executor shutdown
+        # below is skipped — confirm that ordering/propagation is intended.
+        self.structured_output_manager.clear_backend()
         if self.model_executor:
             self.model_executor.shutdown()
 

@@ -107,3 +107,7 @@ def grammar_bitmask(
         # np.ndarray, because that is much more efficient for serialization
         # and deserialization when sending this to the GPU workers.
         return bitmask_tensor.numpy()
+
+    def clear_backend(self) -> None:
+        """Destroy the active backend, if any, and drop the reference to it.
+
+        Safe to call repeatedly: once a backend has been handed to
+        ``destroy()`` it is no longer reachable through ``self.backend``,
+        so each backend instance is destroyed at most once.
+        """
+        backend = self.backend
+        if backend is None:
+            return
+        # Forget the backend before destroying it so that a repeated call
+        # (or a failure inside destroy()) never re-destroys the same object.
+        self.backend = None
+        backend.destroy()
@@ -110,6 +110,9 @@ def allocate_token_bitmask(self, max_num_seqs: int):
         return llguidance_torch.allocate_token_bitmask(
             max_num_seqs, self.ll_tokenizer.vocab_size)
 
+    def destroy(self):
+        """No-op: this backend performs no cleanup here."""
+
 
 @dataclass
 class GuidanceGrammar(StructuredOutputGrammar):

@@ -87,3 +87,9 @@ def allocate_token_bitmask(self, max_num_seqs: int):
             max_num_seqs (int): The maximum number of sequences for which
               to allocate the bitmask.
         """
+
+    @abstractmethod
+    def destroy(self):
+        """
+        Backend-specific cleanup.
+
+        Abstract: every concrete backend must provide its own
+        implementation. What, if anything, is released is left to the
+        implementation.
+        """
@@ -126,6 +126,9 @@ def compile_grammar(self, request_type: StructuredOutputOptions,
     def allocate_token_bitmask(self, max_num_seqs: int):
+        # Delegates to xgrammar, passing max_num_seqs and this backend's
+        # vocab_size; the result is returned unchanged.
         return xgr.allocate_token_bitmask(max_num_seqs, self.vocab_size)
 
+    def destroy(self):
+        """Drop this backend's ``compiler`` attribute.
+
+        Idempotent: if the attribute has already been removed (for
+        example by an earlier ``destroy()`` call), this is a no-op instead
+        of raising ``AttributeError``.
+        """
+        if hasattr(self, "compiler"):
+            del self.compiler
+
 
 @dataclass
 class XgrammarGrammar(StructuredOutputGrammar):