Tutorial #68

Closed
wants to merge 11 commits into from
11 changes: 6 additions & 5 deletions .github/workflows/codecov.yml
@@ -9,24 +9,24 @@ on:
jobs:
  build:
    runs-on: ubuntu-latest

    steps:

      - uses: actions/cache@v2
        if: startsWith(runner.os, 'Linux')
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-

      - uses: actions/checkout@v2
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: 3.7

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
@@ -36,10 +36,11 @@ jobs:
      - name: Code coverage test
        run: pytest --cov-report xml --cov='./vformer/' --cov-config=.coveragerc

      - name: Generate report using codecov
        uses: codecov/codecov-action@v1
        if: always()
        with:
          fail_ci_if_error: false
          file: coverage.xml
          env_vars: OS,PYTHON
9 changes: 8 additions & 1 deletion .github/workflows/linting.yml
@@ -32,4 +32,11 @@ jobs:
      - name: Run pre-commit hooks
        run: |
          pre-commit install
          pre-commit run -a

      - name: .RST Code Format check
        run: |
          pip install rstcheck
          cd docs/Tutorial
          rstcheck Tutorial.rst
150 changes: 150 additions & 0 deletions docs/Tutorial/Tutorial.rst
@@ -0,0 +1,150 @@
============================================
Building Swin Transformer Model with VFormer
============================================

In this tutorial, we will guide you through building a Swin Transformer with the building blocks available in the library.

In general, many vision transformers have three main building blocks, which compose as sketched just below:

1. Patch Embedding
2. Encoder
3. Decoder
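
Conceptually, the three blocks compose into a classifier. The skeleton below is purely illustrative; the class and argument names are hypothetical, not VFormer's API:

.. code-block:: python

    import torch.nn as nn


    class GenericViT(nn.Module):
        """Hypothetical skeleton showing how the three blocks fit together."""

        def __init__(self, embedding, encoder, decoder):
            super().__init__()
            self.embedding = embedding  # 1) image -> patch embeddings
            self.encoder = encoder      # 2) contextualise patches with attention
            self.decoder = decoder      # 3) encoded features -> class scores

        def forward(self, x):
            return self.decoder(self.encoder(self.embedding(x)))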

Now let's look at the architecture of the Swin Transformer.

.. image:: ./images/Swin Transformer Architecture.jpg




The first step is to partition the image into patches, which are then projected into an embedding space.

These patches can be overlapping or non-overlapping; there are different methods to extract these embeddings. Please see the embedding module for more insight.

.. code-block:: python

    import torch.nn as nn

    from vformer.encoder.embedding import PatchEmbedding

    patch_embedding = PatchEmbedding(
        img_size=224,             # size of the input image
        patch_size=4,             # size of a single patch; img_size must be divisible by patch_size
        in_channels=3,            # number of input channels: 3 for an RGB image, 1 for grayscale
        embedding_dim=96,         # every patch is projected into an embedding space of this dimension
        norm_layer=nn.LayerNorm,  # normalisation layer
    )
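
A quick sanity check on the output shape. The shape shown is an assumption, based on the usual convention that a patch embedding returns ``(batch, num_patches, embedding_dim)``:

.. code-block:: python

    import torch

    x = torch.randn(1, 3, 224, 224)  # dummy batch with one RGB image
    print(patch_embedding(x).shape)  # expected: torch.Size([1, 3136, 96]); (224 / 4) ** 2 = 3136 patches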




These embeddings are then encoded with the Swin Encoder block, which consists of Multi-Head Self-Attention (MHSA) followed by a Multi-Layer Perceptron (MLP). We will not go into the details of the encoder implementation here; the sketch below shows the general pattern.
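
For intuition, here is a simplified, generic pre-norm transformer block built from standard PyTorch layers. This is only an illustration of the MHSA-then-MLP pattern; VFormer's actual ``SwinEncoder`` additionally uses shifted-window attention and patch merging:

.. code-block:: python

    import torch.nn as nn


    class EncoderBlock(nn.Module):
        """Generic pre-norm transformer block: MHSA followed by an MLP."""

        def __init__(self, dim, num_heads, mlp_ratio=4.0):
            super().__init__()
            self.norm1 = nn.LayerNorm(dim)
            self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
            self.norm2 = nn.LayerNorm(dim)
            self.mlp = nn.Sequential(
                nn.Linear(dim, int(dim * mlp_ratio)),
                nn.GELU(),
                nn.Linear(int(dim * mlp_ratio), dim),
            )

        def forward(self, x):
            h = self.norm1(x)
            x = x + self.attn(h, h, h)[0]    # residual connection around MHSA
            x = x + self.mlp(self.norm2(x))  # residual connection around MLP
            return x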

Importing and stacking the Swin Encoder:

.. code-block:: python

    import torch.nn as nn

    from vformer.encoder import SwinEncoder
    from vformer.functional import PatchMerging

    depths = [2, 2, 6, 2]        # number of encoder blocks in each of the 4 stages
    num_heads = [3, 6, 12, 24]   # number of attention heads at each stage
    patch_resolution = patch_embedding.patch_resolution
    num_stages, embedding_dim, window_size = 4, 96, 7

    swin_encoder = nn.ModuleList()
    for i in range(num_stages):
        swin_encoder.append(
            SwinEncoder(
                dim=embedding_dim * 2**i,  # dimension of the embedding at the i-th stage
                input_resolution=(
                    patch_resolution[0] // (2**i),
                    patch_resolution[1] // (2**i),
                ),  # resolution of patches at the i-th stage
                depth=depths[i],
                num_heads=num_heads[i],
                window_size=window_size,   # refer to window self-attention for more insight
                norm_layer=nn.LayerNorm,   # normalisation layer
                # no downsampling after the last stage
                downsample=PatchMerging if i < num_stages - 1 else None,
            )
        )
    # swin_encoder now holds all 4 stages, with PatchMerging blocks between them
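
As a quick check, we can push the patch embeddings through all four stages and watch the shape change. The exact shapes are assumptions based on the standard Swin design, where each ``PatchMerging`` halves the patch resolution and doubles the embedding dimension:

.. code-block:: python

    import torch

    x = torch.randn(1, 3, 224, 224)
    out = patch_embedding(x)  # (1, 3136, 96): a 56x56 grid of patch embeddings
    for stage in swin_encoder:
        out = stage(out)
    print(out.shape)          # expected: torch.Size([1, 49, 768]) after three merges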

The encoded tensors are then passed through a decoder for classification:

.. code-block:: python

    from vformer.decoder import MLPDecoder

    decoder = MLPDecoder(
        config=[768, 256, 32],  # list of decoding dimensions
        n_classes=10,           # number of classes
    )
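
Continuing the shape check above, we can pool the encoder output over the patch dimension and decode it. This sketch assumes ``MLPDecoder`` accepts a ``(batch, features)`` tensor whose feature size matches the first entry of ``config``:

.. code-block:: python

    pooled = out.mean(dim=1)  # average over the 49 patch tokens -> (1, 768)
    logits = decoder(pooled)
    print(logits.shape)       # expected: torch.Size([1, 10])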

Now, putting it all together:

.. code-block:: python

    import torch
    import torch.nn as nn

    from vformer.decoder import MLPDecoder
    from vformer.encoder import SwinEncoder
    from vformer.encoder.embedding import PatchEmbedding
    from vformer.functional import PatchMerging


    class SwinTransformer(nn.Module):
        def __init__(
            self,
            img_size=224,
            patch_size=4,
            in_channels=3,
            n_classes=10,
            embedding_dim=96,
            depths=[2, 2, 6, 2],
            num_heads=[3, 6, 12, 24],
            window_size=7,
            mlp_ratio=4.0,
            norm_layer=nn.LayerNorm,
            decoder_config=[768, 256, 32],  # decoding dimensions, as in the decoder example above
            patch_norm=True,
        ):
            super().__init__()
            self.patch_embed = PatchEmbedding(
                img_size=img_size,
                patch_size=patch_size,
                in_channels=in_channels,
                embedding_dim=embedding_dim,
                norm_layer=norm_layer,
            )
            self.patch_resolution = self.patch_embed.patch_resolution

            self.encoder = nn.ModuleList()
            for i_layer in range(len(depths)):
                layer = SwinEncoder(
                    dim=int(embedding_dim * 2**i_layer),
                    input_resolution=(
                        self.patch_resolution[0] // (2**i_layer),
                        self.patch_resolution[1] // (2**i_layer),
                    ),
                    depth=depths[i_layer],
                    num_heads=num_heads[i_layer],
                    window_size=window_size,
                    mlp_ratio=mlp_ratio,
                    norm_layer=norm_layer,
                    # no downsampling after the last stage
                    downsample=PatchMerging if i_layer < len(depths) - 1 else None,
                )
                self.encoder.append(layer)

            self.pool = nn.AdaptiveAvgPool1d(1)
            self.decoder = MLPDecoder(config=decoder_config, n_classes=n_classes)

        def forward(self, x):
            x = self.patch_embed(x)  # (B, num_patches, embedding_dim)
            for layer in self.encoder:
                x = layer(x)         # resolution halves and dim doubles at each merge
            x = self.pool(x.transpose(1, 2)).flatten(1)  # global average pool over patches
            return self.decoder(x)


    model = SwinTransformer()
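
A minimal usage sketch for the assembled model. This is illustrative only: the batch, labels, and loss below are placeholders, not part of VFormer:

.. code-block:: python

    images = torch.randn(8, 3, 224, 224)  # dummy batch of 8 RGB images
    labels = torch.randint(0, 10, (8,))   # dummy class labels

    logits = model(images)                # (8, 10)
    loss = nn.CrossEntropyLoss()(logits, labels)
    loss.backward()                       # gradients flow through embedding, encoder, and decoder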

Some popular Vision Transformer models are already implemented in VFormer; you can use them directly from the ``vformer.models`` module.

For example:

.. code-block:: python

    from vformer.models import SwinTransformer

    model = SwinTransformer(
        img_size=224, patch_size=4, in_channels=3, window_size=7, n_classes=10
    )

