From f78db3d43d24fef0a8471fcfd6380851593d94d1 Mon Sep 17 00:00:00 2001
From: Eustache Le Bihan
Date: Tue, 17 Dec 2024 14:43:39 +0100
Subject: [PATCH] fix check_modular_conversion

---
 .../models/moonshine/modular_moonshine.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py
index 1f718524f6658d..ddd2e864a46f47 100644
--- a/src/transformers/models/moonshine/modular_moonshine.py
+++ b/src/transformers/models/moonshine/modular_moonshine.py
@@ -72,7 +72,7 @@ class MoonshineConfig(PretrainedConfig):
             The non-linear activation function (function or string) in the decoder.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
         decoder_start_token_id (`int`, *optional*, defaults to 1):
             Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
@@ -80,20 +80,20 @@ class MoonshineConfig(PretrainedConfig):
             the task.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
         is_encoder_decoder (`bool`, *optional*, defaults to `True`):
             Whether the model is used as an encoder/decoder or not.
         min_rotary_ndims (`int`, *optional*, defaults to 32):
             The minimum number of dimensions of the RoPE.
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
-        ff_mult (`int`, *optional*, defaults to 4):
-            Factor by which to scale the intermediate size.
         attention_bias (`bool`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         qk_layernorm (`bool`, *optional*, defaults to `False`):
             Whether or not to normalize the Queries and Keys after projecting the hidden states.
+        ff_mult (`int`, *optional*, defaults to 4):
+            Factor by which to scale the intermediate size.
         bos_token_id (`int`, *optional*, defaults to 1):
             Denotes beginning of sequences token id.
         eos_token_id (`int`, *optional*, defaults to 2):
@@ -110,10 +110,10 @@ class MoonshineConfig(PretrainedConfig):
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment == True`.
         mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (`int`, *optional*, defaults to 2),:
             The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
             irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
             mask_time_min_masks''
+        mask_time_min_masks (``, *optional*, defaults to 2):
         mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
             masking procecure generates `mask_feature_prob*len(feature_axis)/mask_time_length` independent masks over
@@ -123,10 +123,10 @@ class MoonshineConfig(PretrainedConfig):
             True`.
         mask_feature_length (`int`, *optional*, defaults to 10):
            Length of vector span along the feature axis.
-        mask_feature_min_masks (`int`, *optional*, defaults to 0),:
             The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
             step, irrespectively of `mask_feature_prob`. Only relevant if
             `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
+        mask_feature_min_masks (``, *optional*, defaults to 0):

         Example:
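For reference, the docstring entries this patch reorders map onto `MoonshineConfig` keyword arguments. Below is a minimal sketch of instantiating the config with the documented defaults; it assumes a transformers checkout that already contains the Moonshine model, and that the constructor still accepts the argument names shown at this point in the PR (some of them, e.g. `ff_mult` and `qk_layernorm`, may have been renamed or dropped in later revisions).

```python
# Sketch only: echoes the defaults documented in the patched docstring.
from transformers import MoonshineConfig

config = MoonshineConfig(
    layer_norm_eps=1e-5,    # epsilon used by the layer normalization layers
    rope_theta=10000.0,     # base period of the RoPE embeddings
    ff_mult=4,              # factor by which to scale the intermediate size
    attention_bias=False,   # no bias in q/k/v/output projections
    attention_dropout=0.0,  # dropout ratio for the attention probabilities
    qk_layernorm=False,     # do not normalize queries/keys after projection
)
print(config.rope_theta, config.ff_mult)
```

The values above simply restate the defaults listed in the docstring; the commit itself only reorders and reformats these entries so that the modular file stays consistent with the output of the `check_modular_conversion` utility named in the subject line.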