From 929134bf65ac986c12c423c30b0db8a239f3b195 Mon Sep 17 00:00:00 2001 From: Adam Ross <14985050+R055A@users.noreply.github.com> Date: Fri, 20 Oct 2023 09:49:55 +0200 Subject: [PATCH] [docstring] Fix docstring for speech-to-text config (#26883) * Fix docstring for speech-to-text config * Refactor doc line len <= 119 char * Remove Speech2TextConfig from OBJECTS_TO_IGNORE * Fix Speech2TextConfig doc str * Fix Speech2TextConfig doc using doc-builder * Refactor Speech2TextConfig doc --- .../configuration_speech_to_text.py | 56 +++++++++++-------- utils/check_docstrings.py | 1 - 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/speech_to_text/configuration_speech_to_text.py b/src/transformers/models/speech_to_text/configuration_speech_to_text.py index 8bad1972e09215..89d8e9a9105b99 100644 --- a/src/transformers/models/speech_to_text/configuration_speech_to_text.py +++ b/src/transformers/models/speech_to_text/configuration_speech_to_text.py @@ -30,7 +30,7 @@ class Speech2TextConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`Speech2TextModel`]. It is used to instantiate an + This is the configuration class to store the configuration of a [`Speech2TextModel`]. It is used to instantiate a Speech2Text model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Speech2Text [facebook/s2t-small-librispeech-asr](https://huggingface.co/facebook/s2t-small-librispeech-asr) architecture. @@ -40,26 +40,36 @@ class Speech2TextConfig(PretrainedConfig): Args: - vocab_size (`int`, *optional*, defaults to 50265): + vocab_size (`int`, *optional*, defaults to 10000): Vocabulary size of the Speech2Text model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`Speech2TextModel`] - d_model (`int`, *optional*, defaults to 1024): - Dimensionality of the layers and the pooler layer. encoder_layers (`int`, *optional*, defaults to 12): Number of encoder layers. - decoder_layers (`int`, *optional*, defaults to 12): - Number of decoder layers. - encoder_attention_heads (`int`, *optional*, defaults to 16): + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (often named feed-forward) layer in encoder. + encoder_attention_heads (`int`, *optional*, defaults to 4): Number of attention heads for each attention layer in the Transformer encoder. - decoder_attention_heads (`int`, *optional*, defaults to 16): - Number of attention heads for each attention layer in the Transformer decoder. - decoder_ffn_dim (`int`, *optional*, defaults to 4096): - Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. - encoder_ffn_dim (`int`, *optional*, defaults to 4096): + decoder_layers (`int`, *optional*, defaults to 6): + Number of decoder layers. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. - activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + decoder_attention_heads (`int`, *optional*, defaults to 4): + Number of attention heads for each attention layer in the Transformer decoder. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for + more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for + more details. + use_cache (`bool`, *optional*, defaults to `True`): + Whether the model should return the last key/values attentions (not used by all models). + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is set up as an encoder-decoder architecture for sequence-to-sequence tasks. + activation_function (`str` or `function`, *optional*, defaults to `"relu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. + d_model (`int`, *optional*, defaults to 256): + Dimensionality of the layers and the pooler layer. dropout (`float`, *optional*, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -68,18 +78,20 @@ class Speech2TextConfig(PretrainedConfig): The dropout ratio for activations inside the fully connected layer. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - encoder_layerdrop (`float`, *optional*, defaults to 0.0): - The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) - for more details. - decoder_layerdrop (`float`, *optional*, defaults to 0.0): - The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) - for more details. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). + decoder_start_token_id (`int`, *optional*, defaults to 2): + The initial token ID of the decoder when decoding sequences. + scale_embedding (`bool`, *optional*, defaults to `True`): + Whether the embeddings are scaled by the square root of `d_model`. + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 0): + The id of the beginning-of-sequence token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the end-of-sequence token. max_source_positions (`int`, *optional*, defaults to 6000): The maximum sequence length of log-mel filter-bank features that this model might ever be used with. max_target_positions (`int`, *optional*, defaults to 1024): - The maximum sequence length that this model might ever be used with. Typically set this to something large + The maximum sequence length that this model might ever be used with. Typically, set this to something large just in case (e.g., 512 or 1024 or 2048). num_conv_layers (`int`, *optional*, defaults to 2): Number of 1D convolutional layers in the conv module. diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 2417b47b7c772c..aeacafbc289e8c 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -467,7 +467,6 @@ "SpecialTokensMixin", "Speech2Text2Config", "Speech2Text2Tokenizer", - "Speech2TextConfig", "Speech2TextTokenizer", "SpeechEncoderDecoderModel", "SpeechT5Config",