diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index eacaf1078..44924ef4d 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1050,10 +1050,9 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
                 scheduled_seq_group.seq_group)
 
         scheduler_time = time.time() - scheduler_start_time
-        # Add this to scheduler time to all the sequences that are either currently running or
-        # swapped out. This is not added to the ones waiting on the queue and never scheduled.
+        # Add this scheduler time to all the sequences that are currently running.
         # This will help estimate if the scheduler is a significant component in the e2e latency.
-        for seq_group in self.running + self.swapped:
+        for seq_group in self.running:
             if seq_group.metrics.scheduler_time is not None:
                 seq_group.metrics.scheduler_time += scheduler_time
             else:
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index b7381372c..9a0bbe92a 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -270,7 +270,6 @@ def execute_model(
         if not get_pp_group().is_first_rank:
             intermediate_tensors = IntermediateTensors(
                 get_pp_group().recv_tensor_dict())
-
         output = self.model_runner.execute_model(
             model_input, self.kv_cache[worker_input.virtual_engine]
             if self.kv_cache is not None else None, intermediate_tensors,
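
For context, the accounting pattern the first hunk narrows can be reproduced in isolation: per-step scheduler overhead is now charged only to sequence groups that are actually running, not to swapped-out or waiting ones. The sketch below is a minimal, hedged illustration, not vLLM's real implementation; RequestMetrics and SequenceGroup here are simplified stand-ins carrying only the fields the diff touches, and charge_scheduler_time is a hypothetical helper name.

# Minimal sketch of the narrowed accounting loop, under the assumptions above.
import time
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class RequestMetrics:
    # Cumulative wall-clock time this request has spent inside schedule().
    # Hypothetical stand-in for the field the diff updates.
    scheduler_time: Optional[float] = None


@dataclass
class SequenceGroup:
    # Simplified stand-in: only the metrics field matters for this sketch.
    request_id: str
    metrics: RequestMetrics = field(default_factory=RequestMetrics)


def charge_scheduler_time(running: List[SequenceGroup],
                          scheduler_time: float) -> None:
    # Mirrors the loop in the patch: charge this step's scheduling cost
    # only to the currently running sequence groups.
    for seq_group in running:
        if seq_group.metrics.scheduler_time is not None:
            seq_group.metrics.scheduler_time += scheduler_time
        else:
            seq_group.metrics.scheduler_time = scheduler_time


running = [SequenceGroup("req-0"), SequenceGroup("req-1")]
start = time.time()
# ... scheduling work would happen here ...
charge_scheduler_time(running, time.time() - start)

Keeping swapped-out groups out of the loop means their recorded scheduler_time no longer grows while they sit in the swap queue, which matches the patch's stated goal of estimating the scheduler's share of e2e latency for requests it actually served in a step.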