diff --git a/src/transformers/models/ernie/configuration_ernie.py b/src/transformers/models/ernie/configuration_ernie.py index 3c76ef78ae4288..7278a74eced517 100644 --- a/src/transformers/models/ernie/configuration_ernie.py +++ b/src/transformers/models/ernie/configuration_ernie.py @@ -81,7 +81,8 @@ class ErnieConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - pad_token_id (``, *optional*, defaults to 0): + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to diff --git a/src/transformers/models/ernie_m/configuration_ernie_m.py b/src/transformers/models/ernie_m/configuration_ernie_m.py index 6f16454a1df9eb..85917dc8288deb 100644 --- a/src/transformers/models/ernie_m/configuration_ernie_m.py +++ b/src/transformers/models/ernie_m/configuration_ernie_m.py @@ -65,14 +65,14 @@ class ErnieMConfig(PretrainedConfig): The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input sequence. initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the normal initializer for initializing all weight matrices. - The index of padding token in the token vocabulary. - pad_token_id (`int`, *optional*, defaults to 1): + The standard deviation of the normal initializer for initializing all weight matrices. + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id, i.e. the index of the padding token in the token + vocabulary. layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. 
classifier_dropout (`float`, *optional*): The dropout ratio for the classification head. - is_decoder (``, *optional*, defaults to `False`): act_dropout (`float`, *optional*, defaults to 0.0): This dropout probability is used in `ErnieMEncoderLayer` after activation. @@ -98,7 +98,6 @@ def __init__( pad_token_id: int = 1, layer_norm_eps: float = 1e-05, classifier_dropout=None, - is_decoder=False, act_dropout=0.0, **kwargs, ): @@ -115,5 +114,4 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.classifier_dropout = classifier_dropout - self.is_decoder = is_decoder self.act_dropout = act_dropout