From 84f03d6d687e9effea9da421cecddc9825f84347 Mon Sep 17 00:00:00 2001 From: bot-of-gabrieldemarmiesse <47452237+bot-of-gabrieldemarmiesse@users.noreply.github.com> Date: Thu, 27 Aug 2020 04:37:59 +0200 Subject: [PATCH] Beautifier layers doc (#2117) Co-authored-by: github-actions[bot] --- .../layers/multihead_attention.py | 41 ++++++++-------- tensorflow_addons/layers/netvlad.py | 10 ++-- tensorflow_addons/layers/normalizations.py | 9 ++-- tensorflow_addons/layers/optical_flow.py | 10 ++-- tensorflow_addons/layers/poincare.py | 8 ++-- tensorflow_addons/layers/polynomial.py | 29 ++++++------ tensorflow_addons/layers/snake.py | 2 +- tensorflow_addons/layers/sparsemax.py | 4 +- .../layers/spatial_pyramid_pooling.py | 2 +- .../layers/spectral_normalization.py | 46 +++++++++--------- tensorflow_addons/layers/tlu.py | 8 ++-- tensorflow_addons/layers/wrappers.py | 47 ++++++++++--------- 12 files changed, 107 insertions(+), 109 deletions(-) diff --git a/tensorflow_addons/layers/multihead_attention.py b/tensorflow_addons/layers/multihead_attention.py index 5d275da525..c939b6ec74 100644 --- a/tensorflow_addons/layers/multihead_attention.py +++ b/tensorflow_addons/layers/multihead_attention.py @@ -20,49 +20,48 @@ @tf.keras.utils.register_keras_serializable(package="Addons") class MultiHeadAttention(tf.keras.layers.Layer): - r""" - MultiHead Attention layer. + r"""MultiHead Attention layer. Defines the MultiHead Attention operation as described in [Attention Is All You Need](https://arxiv.org/abs/1706.03762) which takes in the tensors `query`, `key`, and `value`, and returns the dot-product attention between them: - ```python - mha = MultiHeadAttention(head_size=128, num_heads=12) + ```python + mha = MultiHeadAttention(head_size=128, num_heads=12) - query = tf.random.uniform((32, 20, 200)) # (batch_size, query_elements, query_depth) - key = tf.random.uniform((32, 15, 300)) # (batch_size, key_elements, key_depth) - value = tf.random.uniform((32, 15, 400)) # (batch_size, key_elements, value_depth) + query = tf.random.uniform((32, 20, 200)) # (batch_size, query_elements, query_depth) + key = tf.random.uniform((32, 15, 300)) # (batch_size, key_elements, key_depth) + value = tf.random.uniform((32, 15, 400)) # (batch_size, key_elements, value_depth) - attention = mha([query, key, value]) # (batch_size, query_elements, value_depth) - ``` + attention = mha([query, key, value]) # (batch_size, query_elements, value_depth) + ``` If `value` is not given then internally `value = key` will be used: - ```python - mha = MultiHeadAttention(head_size=128, num_heads=12) + ```python + mha = MultiHeadAttention(head_size=128, num_heads=12) - query = tf.random.uniform((32, 20, 200)) # (batch_size, query_elements, query_depth) - key = tf.random.uniform((32, 15, 300)) # (batch_size, key_elements, key_depth) + query = tf.random.uniform((32, 20, 200)) # (batch_size, query_elements, query_depth) + key = tf.random.uniform((32, 15, 300)) # (batch_size, key_elements, key_depth) - attention = mha([query, key]) # (batch_size, query_elements, key_depth) - ``` + attention = mha([query, key]) # (batch_size, query_elements, key_depth) + ``` Arguments: head_size: int, dimensionality of the `query`, `key` and `value` tensors - after the linear transformation. + after the linear transformation. num_heads: int, number of attention heads. output_size: int, dimensionality of the output space, if `None` then the - input dimension of - `value` or `key` will be used, default `None`. 
+ input dimension of `value` or `key` will be used, + default `None`. dropout: float, `rate` parameter for the dropout layer that is - applied to attention after softmax, + applied to attention after softmax, default `0`. use_projection_bias: bool, whether to use a bias term after the linear - output projection. + output projection. return_attn_coef: bool, if `True`, return the attention coefficients as - an additional output argument. + an additional output argument. kernel_initializer: initializer, initializer for the kernel weights. kernel_regularizer: regularizer, regularizer for the kernel weights. kernel_constraint: constraint, constraint for the kernel weights. diff --git a/tensorflow_addons/layers/netvlad.py b/tensorflow_addons/layers/netvlad.py index 82246ad4f0..a070b8b796 100644 --- a/tensorflow_addons/layers/netvlad.py +++ b/tensorflow_addons/layers/netvlad.py @@ -23,13 +23,11 @@ class NetVLAD(tf.keras.layers.Layer): """Applies NetVLAD to the input. - This is a fully-differentiable version of "Vector of Locally Aggregated Descriptors" commonly used in image - retrieval. It is also used in audio retrieval, and audio represenation learning (ex - "Towards Learning a Universal Non-Semantic Representation of Speech", https://arxiv.org/abs/2002.12764). + This is a fully-differentiable version of "Vector of Locally Aggregated Descriptors" commonly used in image + retrieval. - "NetVLAD: CNN architecture for weakly supervised place recognition" - Relja Arandjelovic, Petr Gronat, Akihiko Torii, Tomas Pajdla, Josef Sivic. - https://arxiv.org/abs/1511.07247 + See [NetVLAD: CNN architecture for weakly supervised place recognition](https://arxiv.org/abs/1511.07247), and + [Towards Learning a Universal Non-Semantic Representation of Speech](https://arxiv.org/abs/2002.12764). Arguments: num_clusters: The number of clusters to use. diff --git a/tensorflow_addons/layers/normalizations.py b/tensorflow_addons/layers/normalizations.py index ca2cad07fc..8fe2910058 100644 --- a/tensorflow_addons/layers/normalizations.py +++ b/tensorflow_addons/layers/normalizations.py @@ -42,7 +42,7 @@ class GroupNormalization(tf.keras.layers.Layer): to number of channels), then this operation becomes identical to Instance Normalization. - Arguments + Arguments: groups: Integer, the number of groups for Group Normalization. Can be in the range [1, N] where N is the input dimension. The input dimension must be divisible by the number of groups. @@ -59,14 +59,15 @@ class GroupNormalization(tf.keras.layers.Layer): beta_constraint: Optional constraint for the beta weight. gamma_constraint: Optional constraint for the gamma weight. - Input shape + Input shape: Arbitrary. Use the keyword argument `input_shape` (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. - Output shape + Output shape: Same shape as input. - References + + References: - [Group Normalization](https://arxiv.org/abs/1803.08494) """ diff --git a/tensorflow_addons/layers/optical_flow.py b/tensorflow_addons/layers/optical_flow.py index fe65f1d59c..3653456af9 100644 --- a/tensorflow_addons/layers/optical_flow.py +++ b/tensorflow_addons/layers/optical_flow.py @@ -34,10 +34,7 @@ def _correlation_cost( ): """Correlation Cost Volume computation. - "FlowNet: Learning Optical Flow with Convolutional Networks" - Philipp Fischer, Alexey Dosovitskiy, Eddy Ilg, Philip Hausser, - Caner Hazirbas, Vladimir Golkov, Patrick van der Smagt, - Daniel Cremers, Thomas Brox.
https://arxiv.org/abs/1504.06852 + See [FlowNet: Learning Optical Flow with Convolutional Networks](https://arxiv.org/abs/1504.06852). Computes a cost volume using correlation for two inputs. For feature maps A, B with spatial dimensions w, h, c it computes @@ -142,9 +139,8 @@ def _correlation_cost_grad(op, grad_output): class CorrelationCost(tf.keras.layers.Layer): """Correlation Cost Layer. - This layer implements the correlation operation from FlowNet Learning - Optical Flow with Convolutional Networks (Fischer et al.): - https://arxiv.org/abs/1504.06 + This layer implements the correlation operation from [FlowNet: Learning + Optical Flow with Convolutional Networks](https://arxiv.org/abs/1504.06852) (Fischer et al.). Args: kernel_size: An integer specifying the height and width of the diff --git a/tensorflow_addons/layers/poincare.py b/tensorflow_addons/layers/poincare.py index 21e5aaecf6..af82b670b9 100644 --- a/tensorflow_addons/layers/poincare.py +++ b/tensorflow_addons/layers/poincare.py @@ -21,12 +21,10 @@ @tf.keras.utils.register_keras_serializable(package="Addons") class PoincareNormalize(tf.keras.layers.Layer): - """Project into the Poincare ball with norm <= 1.0 - epsilon. + """Project into the Poincare ball with `norm <= 1.0 - epsilon`. - https://en.wikipedia.org/wiki/Poincare_ball_model - - Used in Poincare Embeddings for Learning Hierarchical Representations - Maximilian Nickel, Douwe Kiela https://arxiv.org/pdf/1705.08039.pdf + See [Poincaré Embeddings for Learning Hierarchical Representations](https://arxiv.org/pdf/1705.08039.pdf), + and [wiki](https://en.wikipedia.org/wiki/Poincare_ball_model). For a 1-D tensor with `axis = 0`, computes diff --git a/tensorflow_addons/layers/polynomial.py b/tensorflow_addons/layers/polynomial.py index 7bae68fa16..8e1ed791a2 100644 --- a/tensorflow_addons/layers/polynomial.py +++ b/tensorflow_addons/layers/polynomial.py @@ -32,24 +32,23 @@ class PolynomialCrossing(tf.keras.layers.Layer): is the output of the previous `PolynomialCrossing` layer in the stack, i.e., the i-th `PolynomialCrossing` layer. - The output is x_{i+1} = x0 .* (W * x_i + diag_scale * x_i) + bias + xi, where .* designates elementwise - multiplication, W could be a full rank matrix, or a low rank matrix U*V to reduce the computational cost, + The output is `x[i+1] = x0 .* (W * x[i] + diag_scale * x[i]) + bias + x[i]`, where .* designates elementwise + multiplication, W could be a full rank matrix, or a low rank matrix `U*V` to reduce the computational cost, and diag_scale increases the diagonal of W to improve training stability (especially for the low rank case). - References - See [R. Wang](https://arxiv.org/pdf/1708.05123.pdf) + See [Deep & Cross Network for Ad Click Predictions](https://arxiv.org/pdf/1708.05123.pdf).
Example: - ```python - # after embedding layer in a functional model: - input = tf.keras.Input(shape=(None,), name='index', dtype=tf.int64) - x0 = tf.keras.layers.Embedding(input_dim=32, output_dim=6)) - x1 = PolynomialCrossing(projection_dim=None)((x0, x0)) - x2 = PolynomialCrossing(projection_dim=None)((x0, x1)) - logits = tf.keras.layers.Dense(units=10)(x2) - model = tf.keras.Model(input, logits) - ``` + ```python + # after embedding layer in a functional model: + input = tf.keras.Input(shape=(None,), name='index', dtype=tf.int64) + x0 = tf.keras.layers.Embedding(input_dim=32, output_dim=6)(input) + x1 = PolynomialCrossing(projection_dim=None)((x0, x0)) + x2 = PolynomialCrossing(projection_dim=None)((x0, x1)) + logits = tf.keras.layers.Dense(units=10)(x2) + model = tf.keras.Model(input, logits) + ``` Arguments: projection_dim: project dimension to reduce the computational cost. @@ -69,10 +68,10 @@ class PolynomialCrossing(tf.keras.layers.Layer): bias_regularizer: Regularizer instance to use on bias vector. Input shape: - A tuple of 2 (batch_size, `input_dim`) dimensional inputs. + A tuple of 2 `(batch_size, input_dim)` dimensional inputs. Output shape: - A single (batch_size, `input_dim`) dimensional output. + A single `(batch_size, input_dim)` dimensional output. """ @typechecked diff --git a/tensorflow_addons/layers/snake.py b/tensorflow_addons/layers/snake.py index 03549f972c..01ff9f3c8a 100644 --- a/tensorflow_addons/layers/snake.py +++ b/tensorflow_addons/layers/snake.py @@ -26,7 +26,7 @@ class Snake(tf.keras.layers.Layer): """Snake layer to learn periodic functions with the trainable `frequency` scalar. - https://arxiv.org/abs/2006.08195 + See [Neural Networks Fail to Learn Periodic Functions and How to Fix It](https://arxiv.org/abs/2006.08195). Arguments: frequency_initializer: Initializer for the `frequency` scalar. diff --git a/tensorflow_addons/layers/sparsemax.py b/tensorflow_addons/layers/sparsemax.py index fd3ccf8ba3..d9e36f0021 100644 --- a/tensorflow_addons/layers/sparsemax.py +++ b/tensorflow_addons/layers/sparsemax.py @@ -20,11 +20,11 @@ @tf.keras.utils.register_keras_serializable(package="Addons") class Sparsemax(tf.keras.layers.Layer): - """Sparsemax activation function [1]. + """Sparsemax activation function. The output shape is the same as the input shape. - [1]: https://arxiv.org/abs/1602.02068 + See [From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label Classification](https://arxiv.org/abs/1602.02068). Arguments: axis: Integer, axis along which the sparsemax normalization is applied. diff --git a/tensorflow_addons/layers/spatial_pyramid_pooling.py b/tensorflow_addons/layers/spatial_pyramid_pooling.py index dece894f0b..f96c0067eb 100644 --- a/tensorflow_addons/layers/spatial_pyramid_pooling.py +++ b/tensorflow_addons/layers/spatial_pyramid_pooling.py @@ -26,7 +26,7 @@ class SpatialPyramidPooling2D(tf.keras.layers.Layer): """Performs Spatial Pyramid Pooling. - Original Paper: https://arxiv.org/pdf/1406.4729.pdf + See [Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition](https://arxiv.org/pdf/1406.4729.pdf). Spatial Pyramid Pooling generates a fixed-length representation regardless of input size/scale.
It is typically used before a layer diff --git a/tensorflow_addons/layers/spectral_normalization.py b/tensorflow_addons/layers/spectral_normalization.py index 5a4c38c837..958137433f 100644 --- a/tensorflow_addons/layers/spectral_normalization.py +++ b/tensorflow_addons/layers/spectral_normalization.py @@ -19,30 +19,33 @@ @tf.keras.utils.register_keras_serializable(package="Addons") class SpectralNormalization(tf.keras.layers.Wrapper): - """This wrapper controls the Lipschitz constant of the layer by - constraining its spectral norm. - This stabilizes the training of GANs. - Spectral Normalization for Generative Adversarial Networks: - https://arxiv.org/abs/1802.05957 - Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida (2018) - SpectralNormalization wrapper works for keras and tf layers. + """Performs spectral normalization on weights. + + This wrapper controls the Lipschitz constant of the layer by + constraining its spectral norm, which can stabilize the training of GANs. + + See [Spectral Normalization for Generative Adversarial Networks](https://arxiv.org/abs/1802.05957). + ```python - net = SpectralNormalization( - tf.keras.layers.Conv2D(2, 2, activation="relu"), - input_shape=(32, 32, 3))(x) - net = SpectralNormalization( - tf.keras.layers.Conv2D(16, 5, activation="relu"))(net) - net = SpectralNormalization( - tf.keras.layers.Dense(120, activation="relu"))(net) - net = SpectralNormalization( - tf.keras.layers.Dense(n_classes))(net) + net = SpectralNormalization( + tf.keras.layers.Conv2D(2, 2, activation="relu"), + input_shape=(32, 32, 3))(x) + net = SpectralNormalization( + tf.keras.layers.Conv2D(16, 5, activation="relu"))(net) + net = SpectralNormalization( + tf.keras.layers.Dense(120, activation="relu"))(net) + net = SpectralNormalization( + tf.keras.layers.Dense(n_classes))(net) ``` + Arguments: - layer: a layer instance. + layer: A `tf.keras.layers.Layer` instance that + has either a `kernel` or an `embeddings` attribute. + power_iterations: `int`, the number of iterations during normalization. Raises: AssertionError: If not initialized with a `Layer` instance. - ValueError: If initialized with negative `power_iterations` - AttributeError: If `Layer` does not contain a `kernel` or `embeddings` of weights + ValueError: If initialized with negative `power_iterations`. + AttributeError: If `layer` does not have a `kernel` or `embeddings` attribute. """ @typechecked @@ -99,8 +102,9 @@ def compute_output_shape(self, input_shape): @tf.function def normalize_weights(self): """Generate spectral normalized weights. - This method will update the value of self.w with the - spectral normalized value, so that the layer is ready for call(). + + This method will update the value of `self.w` with the + spectral normalized value, so that the layer is ready for `call()`. """ w = tf.reshape(self.w, [-1, self.w_shape[-1]]) diff --git a/tensorflow_addons/layers/tlu.py b/tensorflow_addons/layers/tlu.py index 961a1221d7..51a2ddb6cb 100644 --- a/tensorflow_addons/layers/tlu.py +++ b/tensorflow_addons/layers/tlu.py @@ -22,7 +22,9 @@ @tf.keras.utils.register_keras_serializable(package="Addons") class TLU(tf.keras.layers.Layer): - """Thresholded Linear Unit. An activation function which is similar to ReLU + r"""Thresholded Linear Unit. + + An activation function which is similar to ReLU but with a learned threshold that benefits models using FRN(Filter Response Normalization). Original paper: https://arxiv.org/pdf/1911.09737. @@ -35,8 +37,8 @@ class TLU(tf.keras.layers.Layer): Same shape as the input.
Arguments: - affine: bool. Whether to make it TLU-Affine or not - which has the form `max(x, alpha*x + tau)` + affine: `bool`. Whether to make it TLU-Affine or not + which has the form $\max(x, \alpha*x + \tau)$. """ @typechecked diff --git a/tensorflow_addons/layers/wrappers.py b/tensorflow_addons/layers/wrappers.py index 811d320f7c..c9f9011aa2 100644 --- a/tensorflow_addons/layers/wrappers.py +++ b/tensorflow_addons/layers/wrappers.py @@ -21,37 +21,38 @@ @tf.keras.utils.register_keras_serializable(package="Addons") class WeightNormalization(tf.keras.layers.Wrapper): - """This wrapper reparameterizes a layer by decoupling the weight's - magnitude and direction. + """Performs weight normalization. + This wrapper reparameterizes a layer by decoupling the weight's + magnitude and direction. This speeds up convergence by improving the conditioning of the optimization problem. - Weight Normalization: A Simple Reparameterization to Accelerate - Training of Deep Neural Networks: https://arxiv.org/abs/1602.07868 - Tim Salimans, Diederik P. Kingma (2016) - WeightNormalization wrapper works for keras and tf layers. + + See [Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks](https://arxiv.org/abs/1602.07868). + ```python - net = WeightNormalization( - tf.keras.layers.Conv2D(2, 2, activation='relu'), - input_shape=(32, 32, 3), - data_init=True)(x) - net = WeightNormalization( - tf.keras.layers.Conv2D(16, 5, activation='relu'), - data_init=True)(net) - net = WeightNormalization( - tf.keras.layers.Dense(120, activation='relu'), - data_init=True)(net) - net = WeightNormalization( - tf.keras.layers.Dense(n_classes), - data_init=True)(net) + net = WeightNormalization( + tf.keras.layers.Conv2D(2, 2, activation='relu'), + input_shape=(32, 32, 3), + data_init=True)(x) + net = WeightNormalization( + tf.keras.layers.Conv2D(16, 5, activation='relu'), + data_init=True)(net) + net = WeightNormalization( + tf.keras.layers.Dense(120, activation='relu'), + data_init=True)(net) + net = WeightNormalization( + tf.keras.layers.Dense(n_classes), + data_init=True)(net) ``` + Arguments: - layer: a layer instance. - data_init: If `True` use data dependent variable initialization + layer: A `tf.keras.layers.Layer` instance. + data_init: If `True`, use data-dependent variable initialization. Raises: ValueError: If not initialized with a `Layer` instance. - ValueError: If `Layer` does not contain a `kernel` of weights - NotImplementedError: If `data_init` is True and running graph execution + ValueError: If `Layer` does not contain a `kernel` of weights. + NotImplementedError: If `data_init` is `True` and running graph execution. """ @typechecked
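Several of the layers whose docstrings are beautified above (`GroupNormalization`, `TLU`, `Sparsemax`) still ship no usage snippet. The sketch below is illustrative only and is not part of the patch: it assumes the layers are exposed under `tensorflow_addons.layers` with the constructor arguments listed in the docstrings above, and the architecture, group count, and class count are made up for the example.

```python
import tensorflow as tf
import tensorflow_addons as tfa

# Toy classifier wiring together three of the layers documented above.
inputs = tf.keras.Input(shape=(32, 32, 3))
x = tf.keras.layers.Conv2D(16, 3, padding="same")(inputs)
# GroupNormalization: 16 channels split into 4 groups (must divide evenly).
x = tfa.layers.GroupNormalization(groups=4, axis=-1)(x)
# TLU activation; affine=True uses the max(x, alpha*x + tau) form.
x = tfa.layers.TLU(affine=True)(x)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
logits = tf.keras.layers.Dense(10)(x)
# Sparsemax maps logits to a sparse probability distribution over 10 classes.
outputs = tfa.layers.Sparsemax(axis=-1)(logits)
model = tf.keras.Model(inputs, outputs)
model.summary()
```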
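Likewise, the `NetVLAD` docstring above gains references but no example. Here is a minimal hedged sketch: the `num_clusters` argument and the 3-D `(batch, frames, feature_dim)` input follow the Arguments section in this patch, while the output-shape comment is an assumption based on the VLAD formulation (one `feature_dim`-sized residual sum per cluster) rather than something stated in the diff.

```python
import tensorflow as tf
import tensorflow_addons as tfa

# Batch of 8 examples, each a set of 100 local descriptors of dimension 64
# (e.g. frame-level audio features or CNN feature-map columns).
descriptors = tf.random.uniform((8, 100, 64))

# Aggregate each variable-length descriptor set into a fixed-length encoding.
netvlad = tfa.layers.NetVLAD(num_clusters=16)
encoding = netvlad(descriptors)

# Assumed output shape: (batch_size, num_clusters * feature_dim) = (8, 1024).
print(encoding.shape)
```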