huggingface · jadechoghari · Oct 5, 2023 · Oct 7, 2023 · Oct 8, 2023 · Oct 11, 2023
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
@@ -323,6 +323,7 @@ Flax), PyTorch, and/or TensorFlow.
 |             [Table Transformer](model_doc/table-transformer)             |       ✅        |         ❌         |      ❌      |
 |                         [TAPAS](model_doc/tapas)                         |       ✅        |         ✅         |      ❌      |
 |                         [TAPEX](model_doc/tapex)                         |       ✅        |         ✅         |      ✅      |
+|                       [TextNet](model_doc/textnet)                       |       ✅        |         ❌         |      ❌      |
 |       [Time Series Transformer](model_doc/time_series_transformer)       |       ✅        |         ❌         |      ❌      |
 |                   [TimeSformer](model_doc/timesformer)                   |       ✅        |         ❌         |      ❌      |
 |                [TimmWrapperModel](model_doc/timm_wrapper)                |       ✅        |         ❌         |      ❌      |

diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md
@@ -0,0 +1,51 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# TextNet
+
+## Overview
+
+The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu. TextNet is a vision backbone useful for text detection tasks. It is the result of neural architecture search (NAS) on backbones with reward function as text detection task (to provide powerful features for text detection).
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/fast_architecture.png"
+alt="drawing" width="600"/>
+
+<small> TextNet backbone as part of FAST. Taken from the <a href="https://arxiv.org/abs/2111.02394">original paper.</a> </small>
+
+This model was contributed by [Raghavan](https://huggingface.co/Raghavan), [jadechoghari](https://huggingface.co/jadechoghari) and [nielsr](https://huggingface.co/nielsr).
+
+This model was contributed by [nandwalritik](https://huggingface.co/nandwalritik).
+The original code can be found [here](https://github.com/czczup/FAST).
+
+## TextNetConfig
+
+[[autodoc]] TextNetConfig
+
+## TextNetImageProcessor
+
+[[autodoc]] TextNetImageProcessor
+    - preprocess
+
+## TextNetModel
+
+[[autodoc]] TextNetModel
+    - forward
+
+## TextNetForImageClassification
+
+[[autodoc]] TextNetForImageClassification
+    - forward
+
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -785,6 +785,7 @@
         "TapasConfig",
         "TapasTokenizer",
     ],
+    "models.textnet": ["TextNetConfig"],
     "models.time_series_transformer": ["TimeSeriesTransformerConfig"],
     "models.timesformer": ["TimesformerConfig"],
     "models.timm_backbone": ["TimmBackboneConfig"],
@@ -1252,6 +1253,7 @@
     _import_structure["models.siglip"].append("SiglipImageProcessor")
     _import_structure["models.superpoint"].extend(["SuperPointImageProcessor"])
     _import_structure["models.swin2sr"].append("Swin2SRImageProcessor")
+    _import_structure["models.textnet"].extend(["TextNetImageProcessor"])
     _import_structure["models.tvp"].append("TvpImageProcessor")
     _import_structure["models.video_llava"].append("VideoLlavaImageProcessor")
     _import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"])
@@ -3551,6 +3553,14 @@
             "load_tf_weights_in_tapas",
         ]
     )
+    _import_structure["models.textnet"].extend(
+        [
+            "TextNetBackbone",
+            "TextNetForImageClassification",
+            "TextNetModel",
+            "TextNetPreTrainedModel",
+        ]
+    )
     _import_structure["models.time_series_transformer"].extend(
         [
             "TimeSeriesTransformerForPrediction",
@@ -5770,6 +5780,7 @@
         TapasConfig,
         TapasTokenizer,
     )
+    from .models.textnet import TextNetConfig
     from .models.time_series_transformer import (
         TimeSeriesTransformerConfig,
     )
@@ -6248,6 +6259,7 @@
         from .models.siglip import SiglipImageProcessor
         from .models.superpoint import SuperPointImageProcessor
         from .models.swin2sr import Swin2SRImageProcessor
+        from .models.textnet import TextNetImageProcessor
         from .models.tvp import TvpImageProcessor
         from .models.video_llava import VideoLlavaImageProcessor
         from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor
@@ -8089,6 +8101,12 @@
             TapasPreTrainedModel,
             load_tf_weights_in_tapas,
         )
+        from .models.textnet import (
+            TextNetBackbone,
+            TextNetForImageClassification,
+            TextNetModel,
+            TextNetPreTrainedModel,
+        )
         from .models.time_series_transformer import (
             TimeSeriesTransformerForPrediction,
             TimeSeriesTransformerModel,

diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -249,6 +249,7 @@
     t5,
     table_transformer,
     tapas,
+    textnet,
     time_series_transformer,
     timesformer,
     timm_backbone,

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -276,6 +276,7 @@
         ("t5", "T5Config"),
         ("table-transformer", "TableTransformerConfig"),
         ("tapas", "TapasConfig"),
+        ("textnet", "TextNetConfig"),
         ("time_series_transformer", "TimeSeriesTransformerConfig"),
         ("timesformer", "TimesformerConfig"),
         ("timm_backbone", "TimmBackboneConfig"),
@@ -604,6 +605,7 @@
         ("table-transformer", "Table Transformer"),
         ("tapas", "TAPAS"),
         ("tapex", "TAPEX"),
+        ("textnet", "TextNet"),
         ("time_series_transformer", "Time Series Transformer"),
         ("timesformer", "TimeSformer"),
         ("timm_backbone", "TimmBackbone"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -254,6 +254,7 @@
         ("t5", "T5Model"),
         ("table-transformer", "TableTransformerModel"),
         ("tapas", "TapasModel"),
+        ("textnet", "TextNetModel"),
         ("time_series_transformer", "TimeSeriesTransformerModel"),
         ("timesformer", "TimesformerModel"),
         ("timm_backbone", "TimmBackbone"),
@@ -697,6 +698,7 @@
         ("swiftformer", "SwiftFormerForImageClassification"),
         ("swin", "SwinForImageClassification"),
         ("swinv2", "Swinv2ForImageClassification"),
+        ("textnet", "TextNetForImageClassification"),
         ("timm_wrapper", "TimmWrapperForImageClassification"),
         ("van", "VanForImageClassification"),
         ("vit", "ViTForImageClassification"),
@@ -1378,6 +1380,7 @@
         ("rt_detr_resnet", "RTDetrResNetBackbone"),
         ("swin", "SwinBackbone"),
         ("swinv2", "Swinv2Backbone"),
+        ("textnet", "TextNetBackbone"),
         ("timm_backbone", "TimmBackbone"),
         ("vitdet", "VitDetBackbone"),
     ]

diff --git a/src/transformers/models/textnet/__init__.py b/src/transformers/models/textnet/__init__.py
@@ -0,0 +1,74 @@
+# coding=utf-8
+# Copyright 2024 the Fast authors and HuggingFace Inc. team.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ... import is_vision_available
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+
+
+_import_structure = {
+    "configuration_textnet": ["TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "TextNetConfig"],
+}
+
+try:
+    if not is_vision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["image_processing_textnet"] = ["TextNetImageProcessor"]
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_textnet"] = [
+        "TextNetBackbone",
+        "TextNetModel",
+        "TextNetPreTrainedModel",
+        "TextNetForImageClassification",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_textnet import TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, TextNetConfig
+
+    try:
+        if not is_vision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .image_processing_textnet import TextNetImageProcessor
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_textnet import (
+            TextNetBackbone,
+            TextNetForImageClassification,
+            TextNetModel,
+            TextNetPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/textnet/configuration_textnet.py b/src/transformers/models/textnet/configuration_textnet.py
@@ -0,0 +1,139 @@
+# coding=utf-8
+# Copyright 2024 the Fast authors and HuggingFace Inc. team.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TextNet model configuration"""
+
+from transformers import PretrainedConfig
+from transformers.utils import logging
+from transformers.utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+TEXTNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "textnet-base": ("https://huggingface.co/Raghavan/textnet-base/blob/main/config.json"),
+}
+
+
+class TextNetConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`TextNextModel`]. It is used to instantiate a
+    TextNext model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [Raghavan/textnet-base](https://huggingface.co/Raghavan/textnet-base). Configuration objects inherit from
+    [`PretrainedConfig`] and can be used to control the model outputs.Read the documentation from [`PretrainedConfig`]
+    for more information.
+
+    Args:
+        stem_kernel_size (`int`, *optional*, defaults to 3):
+            The kernel size for the initial convolution layer.
+        stem_stride (`int`, *optional*, defaults to 2):
+            The stride for the initial convolution layer.
+        stem_num_channels (`int`, *optional*, defaults to 3):
+            The num of channels in input for the initial convolution layer.
+        stem_out_channels (`int`, *optional*, defaults to 64):
+            The num of channels in out for the initial convolution layer.
+        stem_act_func (`str`, *optional*, defaults to `"relu"`):
+            The activation function for the initial convolution layer.
+        image_size (`Tuple[int, int]`, *optional*, defaults to `[640, 640]`):
+            The size (resolution) of each image.
+        conv_layer_kernel_sizes (`List[List[List[int]]]`, *optional*):
+            A list of stage-wise kernel sizes. If `None`, defaults to:
+            `[[[3, 3], [3, 3], [3, 3]], [[3, 3], [1, 3], [3, 3], [3, 1]], [[3, 3], [3, 3], [3, 1], [1, 3]], [[3, 3], [3, 1], [1, 3], [3, 3]]]`.
+        conv_layer_strides (`List[List[int]]`, *optional*):
+            A list of stage-wise strides. If `None`, defaults to:
+            `[[1, 2, 1], [2, 1, 1, 1], [2, 1, 1, 1], [2, 1, 1, 1]]`.
+        hidden_sizes (`List[int]`, *optional*, defaults to `[64, 64, 128, 256, 512]`):
+            Dimensionality (hidden size) at each stage.
+        batch_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the batch normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        out_features (`List[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.
+
+    Examples:
+
+    ```python
+    >>> from transformers import TextNetConfig, TextNetBackbone
+
+    >>> # Initializing a TextNetConfig
+    >>> configuration = TextNetConfig()
+
+    >>> # Initializing a model (with random weights)
+    >>> model = TextNetBackbone(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    r"""
+    [Raghavan/textnet-base](https://huggingface.co/Raghavan/textnet-base)
+    """
+    model_type = "textnet"
+
+    def __init__(
+        self,
+        stem_kernel_size=3,
+        stem_stride=2,
+        stem_num_channels=3,
+        stem_out_channels=64,
+        stem_act_func="relu",
+        image_size=[640, 640],
+        conv_layer_kernel_sizes=None,
+        conv_layer_strides=None,
+        hidden_sizes=[64, 64, 128, 256, 512],
+        batch_norm_eps=1e-5,
+        initializer_range=0.02,
+        out_features=None,
+        out_indices=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if conv_layer_kernel_sizes is None:
+            conv_layer_kernel_sizes = [
+                [[3, 3], [3, 3], [3, 3]],
+                [[3, 3], [1, 3], [3, 3], [3, 1]],
+                [[3, 3], [3, 3], [3, 1], [1, 3]],
+                [[3, 3], [3, 1], [1, 3], [3, 3]],
+            ]
+        if conv_layer_strides is None:
+            conv_layer_strides = [[1, 2, 1], [2, 1, 1, 1], [2, 1, 1, 1], [2, 1, 1, 1]]
+
+        self.stem_kernel_size = stem_kernel_size
+        self.stem_stride = stem_stride
+        self.stem_num_channels = stem_num_channels
+        self.stem_out_channels = stem_out_channels
+        self.stem_act_func = stem_act_func
+
+        self.image_size = image_size
+        self.conv_layer_kernel_sizes = conv_layer_kernel_sizes
+        self.conv_layer_strides = conv_layer_strides
+
+        self.initializer_range = initializer_range
+        self.hidden_sizes = hidden_sizes
+        self.batch_norm_eps = batch_norm_eps
+
+        self.depths = [len(layer) for layer in self.conv_layer_kernel_sizes]
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, 5)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )