Commit

nits
eustlb committed Dec 16, 2024
1 parent 72ba8c4 commit f548504
Showing 9 changed files with 158 additions and 278 deletions.
20 changes: 10 additions & 10 deletions src/transformers/__init__.py
@@ -5351,7 +5351,6 @@
)
from .models.gpt_neo import GPTNeoConfig
from .models.gpt_neox import GPTNeoXConfig
from .models.moonshine import MoonshineConfig
from .models.gpt_neox_japanese import (
GPTNeoXJapaneseConfig,
)
@@ -5499,6 +5498,7 @@
from .models.mobilevitv2 import (
MobileViTV2Config,
)
from .models.moonshine import MoonshineConfig
from .models.moshi import (
MoshiConfig,
MoshiDepthConfig,
@@ -6022,7 +6022,6 @@
from .models.gemma import GemmaTokenizerFast
from .models.gpt2 import GPT2TokenizerFast
from .models.gpt_neox import GPTNeoXTokenizerFast
from .models.moonshine import MoonshineTokenizer
from .models.gpt_neox_japanese import GPTNeoXJapaneseTokenizer
from .models.herbert import HerbertTokenizerFast
from .models.layoutlm import LayoutLMTokenizerFast
@@ -6037,6 +6036,7 @@
from .models.mbart import MBartTokenizerFast
from .models.mbart50 import MBart50TokenizerFast
from .models.mobilebert import MobileBertTokenizerFast
from .models.moonshine import MoonshineTokenizer
from .models.mpnet import MPNetTokenizerFast
from .models.mt5 import MT5TokenizerFast
from .models.mvp import MvpTokenizerFast
@@ -7116,14 +7116,6 @@
GPTNeoXModel,
GPTNeoXPreTrainedModel,
)
from .models.moonshine import (
MoonshineForCausalLM,
MoonshineForQuestionAnswering,
MoonshineForSequenceClassification,
MoonshineForTokenClassification,
MoonshineModel,
MoonshinePreTrainedModel,
)
from .models.gpt_neox_japanese import (
GPTNeoXJapaneseForCausalLM,
GPTNeoXJapaneseModel,
@@ -7463,6 +7455,14 @@
MobileViTV2Model,
MobileViTV2PreTrainedModel,
)
from .models.moonshine import (
MoonshineForCausalLM,
MoonshineForQuestionAnswering,
MoonshineForSequenceClassification,
MoonshineForTokenClassification,
MoonshineModel,
MoonshinePreTrainedModel,
)
from .models.moshi import (
MoshiForCausalLM,
MoshiForConditionalGeneration,
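The hunks above only relocate Moonshine's public symbols to their alphabetical slot in the top-level lazy imports; nothing is renamed. A minimal sketch of the (unchanged) import surface, assuming this branch is installed:

```python
# These are exactly the names moved in the diff above; the reordering does not
# alter what the top-level package exposes.
from transformers import (
    MoonshineConfig,
    MoonshineModel,
    MoonshinePreTrainedModel,
    MoonshineTokenizer,
)
```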
2 changes: 1 addition & 1 deletion src/transformers/models/__init__.py
@@ -103,7 +103,6 @@
gpt_bigcode,
gpt_neo,
gpt_neox,
moonshine,
gpt_neox_japanese,
gpt_sw3,
gptj,
@@ -163,6 +162,7 @@
mobilenet_v2,
mobilevit,
mobilevitv2,
moonshine,
moshi,
mpnet,
mpt,
4 changes: 2 additions & 2 deletions src/transformers/models/auto/configuration_auto.py
@@ -121,7 +121,6 @@
("gpt_bigcode", "GPTBigCodeConfig"),
("gpt_neo", "GPTNeoConfig"),
("gpt_neox", "GPTNeoXConfig"),
("moonshine", "MoonshineConfig"),
("gpt_neox_japanese", "GPTNeoXJapaneseConfig"),
("gptj", "GPTJConfig"),
("gptsan-japanese", "GPTSanJapaneseConfig"),
@@ -181,6 +180,7 @@
("mobilenet_v2", "MobileNetV2Config"),
("mobilevit", "MobileViTConfig"),
("mobilevitv2", "MobileViTV2Config"),
("moonshine", "MoonshineConfig"),
("moshi", "MoshiConfig"),
("mpnet", "MPNetConfig"),
("mpt", "MptConfig"),
@@ -426,7 +426,6 @@
("gpt_bigcode", "GPTBigCode"),
("gpt_neo", "GPT Neo"),
("gpt_neox", "GPT NeoX"),
("moonshine", "moonshine"),
("gpt_neox_japanese", "GPT NeoX Japanese"),
("gptj", "GPT-J"),
("gptsan-japanese", "GPTSAN-japanese"),
@@ -496,6 +495,7 @@
("mobilenet_v2", "MobileNetV2"),
("mobilevit", "MobileViT"),
("mobilevitv2", "MobileViTV2"),
("moonshine", "moonshine"),
("moshi", "Moshi"),
("mpnet", "MPNet"),
("mpt", "MPT"),
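For context, the relocated `("moonshine", "MoonshineConfig")` entry is what keeps the model type resolvable through the Auto API. A minimal sketch of what that mapping enables, assuming this branch is installed:

```python
from transformers import AutoConfig

# CONFIG_MAPPING_NAMES maps the "moonshine" model type to MoonshineConfig, so
# the Auto API can build a default config without model-specific imports.
config = AutoConfig.for_model("moonshine")
print(type(config).__name__)  # MoonshineConfig
```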
8 changes: 2 additions & 6 deletions src/transformers/models/auto/modeling_auto.py
@@ -118,7 +118,6 @@
("gpt_bigcode", "GPTBigCodeModel"),
("gpt_neo", "GPTNeoModel"),
("gpt_neox", "GPTNeoXModel"),
("moonshine", "MoonshineModel"),
("gpt_neox_japanese", "GPTNeoXJapaneseModel"),
("gptj", "GPTJModel"),
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
@@ -171,6 +170,7 @@
("mobilenet_v2", "MobileNetV2Model"),
("mobilevit", "MobileViTModel"),
("mobilevitv2", "MobileViTV2Model"),
("moonshine", "MoonshineModel"),
("moshi", "MoshiModel"),
("mpnet", "MPNetModel"),
("mpt", "MptModel"),
@@ -409,7 +409,6 @@
("gpt_bigcode", "GPTBigCodeForCausalLM"),
("gpt_neo", "GPTNeoForCausalLM"),
("gpt_neox", "GPTNeoXForCausalLM"),
("moonshine", "MoonshineForCausalLM"),
("gpt_neox_japanese", "GPTNeoXJapaneseForCausalLM"),
("gptj", "GPTJForCausalLM"),
("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"),
@@ -426,6 +425,7 @@
("mega", "MegaForMaskedLM"),
("megatron-bert", "MegatronBertForCausalLM"),
("mobilebert", "MobileBertForMaskedLM"),
("moonshine", "MoonshineForConditionalGeneration"),
("mpnet", "MPNetForMaskedLM"),
("mpt", "MptForCausalLM"),
("mra", "MraForMaskedLM"),
@@ -496,7 +496,6 @@
("gpt_bigcode", "GPTBigCodeForCausalLM"),
("gpt_neo", "GPTNeoForCausalLM"),
("gpt_neox", "GPTNeoXForCausalLM"),
("moonshine", "MoonshineForCausalLM"),
("gpt_neox_japanese", "GPTNeoXJapaneseForCausalLM"),
("gptj", "GPTJForCausalLM"),
("granite", "GraniteForCausalLM"),
@@ -954,7 +953,6 @@
("gpt_bigcode", "GPTBigCodeForSequenceClassification"),
("gpt_neo", "GPTNeoForSequenceClassification"),
("gpt_neox", "GPTNeoXForSequenceClassification"),
("moonshine", "MoonshineForSequenceClassification"),
("gptj", "GPTJForSequenceClassification"),
("ibert", "IBertForSequenceClassification"),
("jamba", "JambaForSequenceClassification"),
@@ -1043,7 +1041,6 @@
("gpt2", "GPT2ForQuestionAnswering"),
("gpt_neo", "GPTNeoForQuestionAnswering"),
("gpt_neox", "GPTNeoXForQuestionAnswering"),
("moonshine", "MoonshineForQuestionAnswering"),
("gptj", "GPTJForQuestionAnswering"),
("ibert", "IBertForQuestionAnswering"),
("layoutlmv2", "LayoutLMv2ForQuestionAnswering"),
@@ -1147,7 +1144,6 @@
("gpt_bigcode", "GPTBigCodeForTokenClassification"),
("gpt_neo", "GPTNeoForTokenClassification"),
("gpt_neox", "GPTNeoXForTokenClassification"),
("moonshine", "MoonshineForTokenClassification"),
("ibert", "IBertForTokenClassification"),
("layoutlm", "LayoutLMForTokenClassification"),
("layoutlmv2", "LayoutLMv2ForTokenClassification"),
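As a hedged sketch of what the `("moonshine", "MoonshineModel")` entry buys (assuming the Moonshine modeling code on this branch imports cleanly), `AutoModel` can now instantiate the bare model straight from a config:

```python
from transformers import AutoConfig, AutoModel

# from_config builds a randomly initialized model; no checkpoint is needed,
# which is enough to show the auto-mapping resolves to MoonshineModel.
config = AutoConfig.for_model("moonshine")
model = AutoModel.from_config(config)
print(type(model).__name__)  # MoonshineModel
```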
1 change: 1 addition & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -79,6 +79,7 @@
("mctct", "MCTCTProcessor"),
("mgp-str", "MgpstrProcessor"),
("mllama", "MllamaProcessor"),
("moonshine", "Wav2Vec2Processor"),
("oneformer", "OneFormerProcessor"),
("owlv2", "Owlv2Processor"),
("owlvit", "OwlViTProcessor"),
2 changes: 1 addition & 1 deletion src/transformers/models/auto/tokenization_auto.py
@@ -310,8 +310,8 @@
("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
("moshi", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("moonshine", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("moshi", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
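The two mappings above mean Moonshine checkpoints reuse `Wav2Vec2Processor` and a plain `PreTrainedTokenizerFast` rather than shipping dedicated classes. A sketch of the loading path, with a hypothetical checkpoint id used purely for illustration:

```python
from transformers import AutoProcessor, AutoTokenizer

ckpt = "your-org/moonshine-checkpoint"  # placeholder repo id, not a real one

# processing_auto resolves "moonshine" to Wav2Vec2Processor, and
# tokenization_auto falls back to the generic fast tokenizer.
processor = AutoProcessor.from_pretrained(ckpt)
tokenizer = AutoTokenizer.from_pretrained(ckpt)
```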
57 changes: 4 additions & 53 deletions src/transformers/models/moonshine/configuration_moonshine.py
@@ -4,7 +4,6 @@
# the file from the modular. If any change should be done, please apply the change to the
# modular_moonshine.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨

from ...configuration_utils import PretrainedConfig


@@ -42,8 +41,6 @@ class MoonshineConfig(PretrainedConfig):
The non-linear activation function (function or string) in the encoder.
decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. TODO: check this
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
@@ -56,10 +53,8 @@
Whether or not the model should return the last key/values attentions (not used by all models).
is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Whether the model is used as an encoder/decoder or not.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings. TODO: check this
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
Percentage of the query and keys which will have rotary embedding. TODO: check this
min_rotary_ndims (`int`, *optional*, defaults to 32):
The minimum number of dimensions of the RoPE.
ff_mult (`int`, *optional*, defaults to 4):
Factor by which to scale the intermediate size.
attention_bias (`bool`, *optional*, defaults to `False`):
@@ -68,43 +63,6 @@
The dropout ratio for the attention probabilities.
qk_layernorm (`bool`, *optional*, defaults to `False`):
Whether or not to normalize the Queries and Keys after projecting the hidden states.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
bos_token_id (`int`, *optional*, defaults to 1):
Denotes beginning of sequences token id.
eos_token_id (`int`, *optional*, defaults to 2):
@@ -167,18 +125,15 @@ def __init__(
num_key_value_heads=None,
encoder_hidden_act="gelu",
decoder_hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
layer_norm_eps=1e-5,
decoder_start_token_id=1,
use_cache=True,
is_encoder_decoder=True,
rope_theta=10000.0,
partial_rotary_factor=0.5,
min_rotary_ndims=32,
attention_bias=False,
attention_dropout=0.0,
qk_layernorm=False,
rope_scaling=None,
ff_mult=4,
bos_token_id=1,
eos_token_id=2,
Expand All @@ -203,19 +158,15 @@ def __init__(
self.num_key_value_heads = num_key_value_heads
self.encoder_hidden_act = encoder_hidden_act
self.decoder_hidden_act = decoder_hidden_act
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.decoder_start_token_id = decoder_start_token_id
self.use_cache = use_cache
self.is_encoder_decoder = is_encoder_decoder
self.rope_theta = rope_theta
self.partial_rotary_factor = partial_rotary_factor

self.min_rotary_ndims = min_rotary_ndims
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.qk_layernorm = qk_layernorm
self.rope_scaling = rope_scaling
self.ff_mult = ff_mult

# fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
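The hunks above drop the `max_position_embeddings`, `rope_theta`, `partial_rotary_factor`, and `rope_scaling` arguments, leaving the rotary dimensionality expressed through `min_rotary_ndims`. A minimal sketch of constructing the trimmed config, using only keyword arguments visible in the new signature and assuming this branch is installed:

```python
from transformers import MoonshineConfig

# Only arguments that appear in the updated __init__ are passed here;
# everything else keeps its default value.
config = MoonshineConfig(
    min_rotary_ndims=32,
    ff_mult=4,
    attention_dropout=0.0,
    qk_layernorm=False,
)
```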