Generate: assistant should sample when the main model samples (huggin…

…gface#33534)
Cyrilvallez · Sep 20, 2024 · 77c5d59 · 77c5d59
1 parent dc8b6ea
commit 77c5d59
Showing 1 changed file with 0 additions and 6 deletions.
diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py
@@ -160,12 +160,6 @@ def __init__(
         self.generation_config.output_scores = True
         self.generation_config.assistant_confidence_threshold = self.assistant_confidence_threshold
 
-        # Disable sampling -- this implementation of assisted generation/speculative decoding uses the assistant
-        # greedily to maximize matches. Disables sampling-related flags to prevent warnings
-        self.generation_config.do_sample = False
-        for attr in ("temperature", "top_p", "min_p", "typical_p", "top_k", "epsilon_cutoff", "eta_cutoff"):
-            setattr(self.generation_config, attr, None)
-
         # avoid unnecessary warnings that min_length is larger than max_new_tokens
         # remove the `MinLengthLogitsProcessor` if exists (NOTE: no need to check for `MinNewTokensLogitsProcessor`)
         self.main_model_min_length = self.generation_config.min_length