Enable passing epsilon when building norm layers (#1399)
* adding eps to building norms

* adding norm eps to layers and configs

* adding docstrings
gupta-abhay authored Jul 26, 2024
1 parent 7de4969 commit d8c1552
Showing 5 changed files with 20 additions and 0 deletions.
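For context, the epsilon of a norm layer is the small constant added to the variance before the square root, y = (x - mean) / sqrt(var + eps) * weight + bias, which keeps the denominator away from zero; until this commit the norm builders fell back to each norm class's default value. A minimal PyTorch illustration of the knob being exposed (the hidden size and eps values below are hypothetical, not taken from this repository):

import torch

ln_default = torch.nn.LayerNorm(768)          # PyTorch default, eps=1e-5
ln_small = torch.nn.LayerNorm(768, eps=1e-6)  # a tighter, explicitly chosen epsilon
print(ln_default.eps, ln_small.eps)           # 1e-05 1e-06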
7 changes: 7 additions & 0 deletions llmfoundry/models/layers/attention.py
@@ -415,6 +415,7 @@ def __init__(
softmax_scale: Optional[float] = None,
attn_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
bias: bool = True,
@@ -520,6 +521,7 @@ def __init__(
self.q_ln = build_norm(
name=norm_type.lower(),
normalized_shape=norm_size,
eps=norm_eps,
device=device,
)
if self.reuse_kv_layer_idx is None:
@@ -528,6 +530,7 @@ def __init__(
self.k_ln = build_norm(
name=norm_type.lower(),
normalized_shape=norm_size,
eps=norm_eps,
device=device,
)

@@ -796,6 +799,7 @@ def __init__(
softmax_scale: Optional[float] = None,
attn_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
bias: bool = True,
@@ -814,6 +818,7 @@ def __init__(
softmax_scale=softmax_scale,
attn_pdrop=attn_pdrop,
norm_type=norm_type,
norm_eps=norm_eps,
fc_type=fc_type,
device=device,
bias=bias,
@@ -841,6 +846,7 @@ def __init__(
softmax_scale: Optional[float] = None,
attn_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
bias: bool = True,
@@ -859,6 +865,7 @@ def __init__(
softmax_scale=softmax_scale,
attn_pdrop=attn_pdrop,
norm_type=norm_type,
norm_eps=norm_eps,
fc_type=fc_type,
device=device,
bias=bias,
7 changes: 7 additions & 0 deletions llmfoundry/models/layers/blocks.py
@@ -42,6 +42,7 @@ def __init__(
ffn_config: Optional[Dict] = None,
resid_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
fc_type: Optional[dict[str, Any]] = None,
device: Optional[str] = None,
no_bias: bool = False,
@@ -84,6 +85,7 @@ def __init__(
fc_type=fc_type,
resid_pdrop=resid_pdrop,
norm_type=norm_type,
norm_eps=norm_eps,
device=device,
no_bias=no_bias,
)
@@ -99,6 +101,7 @@ def __init__(
self.norm_1 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)
self.attn = build_attention_layer(
@@ -117,6 +120,7 @@ def __init__(
self.norm_2 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)

@@ -260,6 +264,7 @@ def __init__(
fc_type: Optional[dict[str, Any]] = None,
resid_pdrop: float = 0.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
device: Optional[str] = None,
no_bias: bool = False,
**kwargs: Any,
@@ -283,6 +288,7 @@ def __init__(
self.norm_1 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)
self.attn = build_attention_layer(
@@ -302,6 +308,7 @@ def __init__(
self.norm_2 = build_norm(
name=norm_type.lower(),
normalized_shape=d_model,
eps=norm_eps,
device=device,
)
self.resid_attn_dropout = nn.Dropout(resid_pdrop)
2 changes: 2 additions & 0 deletions llmfoundry/models/layers/layer_builders.py
@@ -26,10 +26,12 @@
def build_norm(
name: str,
normalized_shape: Union[int, List[int], torch.Size],
eps: Optional[float] = 1e-5,
device: Optional[str] = None,
):
kwargs = {
'normalized_shape': normalized_shape,
'eps': eps,
'device': device,
}

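With eps now part of the build_norm signature, callers can override the class default instead of inheriting it implicitly, which is what the attention and block layers above do through their new norm_eps argument. A minimal usage sketch, assuming llmfoundry is importable (the shape, eps, and device values are illustrative):

from llmfoundry.models.layers.layer_builders import build_norm

norm = build_norm(
    name='low_precision_layernorm',  # the default norm_type used by the layers above
    normalized_shape=768,
    eps=1e-6,
    device='cpu',
)
print(norm)  # the module repr should show the eps that was actually applied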
3 changes: 3 additions & 0 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -44,6 +44,7 @@ def __init__(
no_bias: bool = False,
embedding_fraction: float = 1.0,
norm_type: str = 'low_precision_layernorm',
norm_eps: float = 1e-05,
use_cache: bool = False,
init_config: Optional[Dict] = None,
fc_type: Union[str, Dict] = 'torch',
@@ -101,6 +102,7 @@ def __init__(
no_bias (bool): Whether to use bias in all layers.
embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
norm_type (str): choose type of norm to use
norm_eps (float): epsilon value for norm layer
use_cache (bool): Whether or not the model should return the last key/values attentions
init_config (Dict): A dictionary used to configure the model initialization:
init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
@@ -168,6 +170,7 @@ def __init__(
self.no_bias = no_bias
self.embedding_fraction = embedding_fraction
self.norm_type = norm_type
self.norm_eps = norm_eps
self.use_cache = use_cache
self.init_config = init_config if init_config is not None else copy.deepcopy(
init_config_defaults,
1 change: 1 addition & 0 deletions llmfoundry/models/mpt/modeling_mpt.py
@@ -426,6 +426,7 @@ def __init__(self, config: MPTConfig):
self.norm_f = build_norm(
name=config.norm_type.lower(),
normalized_shape=config.d_model,
eps=config.norm_eps,
device=config.init_device,
)

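Taken together, norm_eps flows from MPTConfig through every block's norms and into the final norm_f built here. A rough end-to-end sketch, assuming llmfoundry is importable and using deliberately small, hypothetical model sizes (untested):

from llmfoundry.models.mpt.configuration_mpt import MPTConfig
from llmfoundry.models.mpt.modeling_mpt import MPTModel

config = MPTConfig(d_model=128, n_heads=4, n_layers=2, norm_eps=1e-6)
model = MPTModel(config)

# The final norm should now reflect the configured epsilon rather than the
# class default it fell back to before this change.
print(model.norm_f)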
