@@ -288,7 +288,6 @@ def response_loop(self):
288
288
if item is None :
289
289
break
290
290
response_sender , response , response_flag = item
291
- del item
292
291
try :
293
292
response_sender .send (response , response_flag )
294
293
except Exception as e :
@@ -298,9 +297,6 @@ def response_loop(self):
298
297
finally :
299
298
if response_flag == pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL :
300
299
self .ongoing_request_count -= 1
301
- del response_sender
302
- if self .ongoing_request_count == 0 :
303
- gc .collect ()
304
300
305
301
def create_response (self , vllm_output , prepend_input ):
306
302
"""
@@ -447,9 +443,6 @@ async def generate(self, request):
447
443
finally :
448
444
if decrement_ongoing_request_count :
449
445
self .ongoing_request_count -= 1
450
- del response_sender
451
- if self .ongoing_request_count == 0 :
452
- gc .collect ()
453
446
454
447
def verify_loras (self , request ):
455
448
# We will check if the requested lora exists here, if not we will send a
@@ -527,3 +520,9 @@ def finalize(self):
527
520
if self ._response_thread is not None :
528
521
self ._response_thread .join ()
529
522
self ._response_thread = None
523
+
524
+ # When using parallel tensors, the stub process may not shut down due to
525
+ # unreleased references, so manually run the garbage collector once.
526
+ self .logger .log_info ("[vllm] Running Garbage Collector on finalize..." )
527
+ gc .collect ()
528
+ self .logger .log_info ("[vllm] Garbage Collector on finalize... done" )
0 commit comments