From 20db7555feb9f9db7d1c6ac3bd8a4b7684b8e849 Mon Sep 17 00:00:00 2001
From: Alexander Kurz <alex.kurz95@gmail.com>
Date: Tue, 5 Dec 2023 16:37:17 +0100
Subject: [PATCH] Added pre-commit and adjusted files

---
 .gitignore                          |   2 +-
 .pre-commit-config.yaml             | 148 ++++++++++++++++++++++++++++
 README.md                           |  69 ++++++-------
 configs/mco_256.json                |   2 +-
 configs/mco_512.json                |   2 +-
 configs/tcga-crc_256.json           |   2 +-
 configs/tcga-crc_512.json           |   2 +-
 count_tiles.py                      |   8 +-
 environment.yml => environment.yaml |   4 +-
 tile_generator.py                   |  28 +++---
 tissue_detection.py                 |   6 +-
 11 files changed, 209 insertions(+), 64 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 rename environment.yml => environment.yaml (90%)

diff --git a/.gitignore b/.gitignore
index 2e149ae..e1b13d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,4 @@ __pycache__
 .idea/*
 
 # VSCode
-.vscode
\ No newline at end of file
+.vscode
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..cdcfa3c
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,148 @@
+default_language_version:
+  python: python3
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      # list of supported hooks: https://pre-commit.com/hooks.html
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-docstring-first
+      - id: check-yaml
+      - id: debug-statements
+      - id: detect-private-key
+      - id: check-executables-have-shebangs
+      - id: check-toml
+      - id: check-case-conflict
+      - id: check-added-large-files
+
+  # python code formatting
+  - repo: https://github.com/psf/black
+    rev: 23.1.0
+    hooks:
+      - id: black
+        args: [--line-length, "120"]
+
+  # python import sorting
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: ["--profile", "black", "--filter-files"]
+
+  # upgrade python syntax to newer versions
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.3.1
+    hooks:
+      - id: pyupgrade
+        args: [--py38-plus]
+
+  # python docstring formatting
+  - repo: https://github.com/myint/docformatter
+    rev: v1.7.4
+    hooks:
+      - id: docformatter
+        args:
+          [
+            --in-place,
+            --wrap-summaries=99,
+            --wrap-descriptions=99,
+            --style=sphinx,
+            --black,
+          ]
+
+  # python docstring coverage checking
+  # - repo: https://github.com/econchick/interrogate
+  #   rev: 1.5.0 # or master if you're bold
+  #   hooks:
+  #     - id: interrogate
+  #       args:
+  #         [
+  #           --verbose,
+  #           --fail-under=80,
+  #           --ignore-init-module,
+  #           --ignore-init-method,
+  #           --ignore-module,
+  #           --ignore-nested-functions,
+  #           -vv,
+  #         ]
+
+  # python check (PEP8), programming errors and code complexity
+  # - repo: https://github.com/PyCQA/flake8
+  #   rev: 6.0.0
+  #   hooks:
+  #     - id: flake8
+  #       args:
+  #         [
+  #           "--extend-ignore",
+  #           "E203,E402,E501,F401,F841,RST2,RST301",
+  #           "--exclude",
+  #           "logs/*,data/*",
+  #         ]
+  #       additional_dependencies: [flake8-rst-docstrings==0.3.0]
+
+  # python security linter
+  # - repo: https://github.com/PyCQA/bandit
+  #   rev: "1.7.5"
+  #   hooks:
+  #     - id: bandit
+  #       args: ["-s", "B101"]
+
+  # yaml formatting
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v3.0.0-alpha.6
+    hooks:
+      - id: prettier
+        types: [yaml]
+        exclude: "environment.yaml"
+
+  # shell scripts linter
+  - repo: https://github.com/shellcheck-py/shellcheck-py
+    rev: v0.9.0.2
+    hooks:
+      - id: shellcheck
+
+  # md formatting
+  - repo: https://github.com/executablebooks/mdformat
+    rev: 0.7.17
+    hooks:
+      - id: mdformat
+        args: ["--number"]
+        additional_dependencies:
+          - mdformat-gfm
+          - mdformat-frontmatter
+          - mdformat-myst # Required for inline math, see https://github.com/executablebooks/mdformat
+          - mdformat-tables
+          # - mdformat-black
+          # - mdformat-toc
+
+  # word spelling linter
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.4
+    hooks:
+      - id: codespell
+        args:
+          - --skip=logs/**,data/**,*.ipynb
+          # - --ignore-words-list=abc,def
+
+  # jupyter notebook cell output clearing
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.6.1
+    hooks:
+      - id: nbstripout
+
+  # jupyter notebook linting
+  - repo: https://github.com/nbQA-dev/nbQA
+    rev: 1.6.3
+    hooks:
+      - id: nbqa-black
+        args: ["--line-length=99"]
+      - id: nbqa-isort
+        args: ["--profile=black"]
+      - id: nbqa-flake8
+        args:
+          [
+            "--extend-ignore=E203,E402,E501,F401,F841",
+            "--exclude=logs/*,data/*",
+          ]
diff --git a/README.md b/README.md
index 836bf0e..96b8572 100644
--- a/README.md
+++ b/README.md
@@ -14,10 +14,11 @@ conda activate wsi-pre2
 
 ## Run tiling with provided config files
 
-The main script to run is `tile_generator.py`. We provide configs in the `configs/` folder which generate tables of patch locations with the corresponding pixel sizes. The tables are then stored as `.csv` files for each slide in the configured `output_path`. 
+The main script to run is `tile_generator.py`. We provide configs in the `configs/` folder which generate tables of patch locations with the corresponding pixel sizes. The tables are then stored as `.csv` files for each slide in the configured `output_path`.
 By default multiprocessing is enabled, such that multiple slides can be processed simultaneously.
 
 As example the tiling of TCGA slides with `patch_size=256` can be started as follows:
+
 ```bash
 python tile_generator.py --config configs/tcga-crc_256.json
 ```
@@ -26,36 +27,36 @@ python tile_generator.py --config configs/tcga-crc_256.json
 
 The table shows descriptions for the most important config parameters:
 
-| Dictionary Entry | Description |
-| ----------- | ----------- |
-| check_resolution | Perform a resolution check of all slides before extracting patches |
-| use_tissue_detection | Toggle the activation of tissue detection |
-| remove_top_border | Useful for Camelyon slides. Default is false |
-| save_patches | In old pipelines we used to store patches. In this project the default is false |
-| zip_patches | Experimental to try if zipped patch image directories increase transfer speeds. Default is false. |
-| tissue_coverage | Threshold [0,1] for how much tissue coverage is necessary, default is 0.8|
-| processing_level | Level of downscaling by openslide - Lowering the level will increase precision but more time is needed, default is 3 | 
-| blocked_threads |Number of threads that wont be used by the program|
-| patches_per_tile | Number of patches used for lower resolution operations like tissue detection | 
-| overlap | Value [0,1] to set the overlap between neighbouring unannotated patches |
-| annotation_overlap | Value [0,1] to set the overlap between neighbouring annotated patches | 
-| patch_size | Output pixel size of the quadratic patches |
-| calibration | |
-| use_non_pixel_lengths | Activate calibration and use micrometers instead of pixels |
-| patch_size_microns | Specify the patch size in micrometers. At 0.25 $\mu\text{m}$ / pixel, 64 $\mu\text{m}$ equal 256 pixels |
-| resize | Whether to resize the patches in micrometers to the given patch_size |
-| dataset | Provide name for the dataset |
-| slides_dir | Directory where the different slides and subdirs are located  | 
-| slideinfo_file | Provide a .csv file with filenames and labels | 
-| annotation_dir | Directory where the annotations are located |
-| annotation_file_format | File format of the input annotations ("xml","geojson")| 
-| output_path | Output directory to where the resulting files will be stored |
-| skip_unlabeled_slides | Boolean to skip slides without an annotation file | 
-| save_annotated_only | Boolean to only save annotated patches |
-| output_format | Image output format. Either "jpeg" or "png" |
-| metadata_format | Format in which slide metadata is stored. Default is "csv" |
-| write_slideinfo | Write information about the processed slide | 
-| show_mode | Boolean to enable plotting of some intermediate results/visualizations | 
-| label_dict |  Structure to set up the operator and the threshold for checking the coverage of a certain class|
-| type | Operator type [ "==", ">=", "<="]| 
-| threshold | Coverage threshold for the individual class |
\ No newline at end of file
+| Dictionary Entry       | Description                                                                                                          |
+| ---------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| check_resolution       | Perform a resolution check of all slides before extracting patches                                                   |
+| use_tissue_detection   | Toggle the activation of tissue detection                                                                            |
+| remove_top_border      | Useful for Camelyon slides. Default is false                                                                         |
+| save_patches           | In old pipelines we used to store patches. In this project the default is false                                      |
+| zip_patches            | Experimental to try if zipped patch image directories increase transfer speeds. Default is false.                    |
+| tissue_coverage        | Threshold \[0,1\] for how much tissue coverage is necessary, default is 0.8                                          |
+| processing_level       | Level of downscaling by openslide - Lowering the level will increase precision but more time is needed, default is 3 |
+| blocked_threads        | Number of threads that won't be used by the program                                                                  |
+| patches_per_tile       | Number of patches used for lower resolution operations like tissue detection                                         |
+| overlap                | Value \[0,1\] to set the overlap between neighbouring unannotated patches                                            |
+| annotation_overlap     | Value \[0,1\] to set the overlap between neighbouring annotated patches                                              |
+| patch_size             | Output pixel size of the square patches                                                                              |
+| calibration            |                                                                                                                      |
+| use_non_pixel_lengths  | Activate calibration and use micrometers instead of pixels                                                           |
+| patch_size_microns     | Specify the patch size in micrometers. At 0.25 $\mu\text{m}$ / pixel, 64 $\mu\text{m}$ equal 256 pixels              |
+| resize                 | Whether to resize the patches in micrometers to the given patch_size                                                 |
+| dataset                | Provide name for the dataset                                                                                         |
+| slides_dir             | Directory where the different slides and subdirs are located                                                         |
+| slideinfo_file         | Provide a .csv file with filenames and labels                                                                        |
+| annotation_dir         | Directory where the annotations are located                                                                          |
+| annotation_file_format | File format of the input annotations ("xml","geojson")                                                               |
+| output_path            | Output directory where the resulting files will be stored                                                            |
+| skip_unlabeled_slides  | Boolean to skip slides without an annotation file                                                                    |
+| save_annotated_only    | Boolean to only save annotated patches                                                                               |
+| output_format          | Image output format. Either "jpeg" or "png"                                                                          |
+| metadata_format        | Format in which slide metadata is stored. Default is "csv"                                                           |
+| write_slideinfo        | Write information about the processed slide                                                                          |
+| show_mode              | Boolean to enable plotting of some intermediate results/visualizations                                               |
+| label_dict             | Structure to set up the operator and the threshold for checking the coverage of a certain class                      |
+| type                   | Operator type \["==", ">=", "\<="\]                                                                                  |
+| threshold              | Coverage threshold for the individual class                                                                          |
diff --git a/configs/mco_256.json b/configs/mco_256.json
index 07c6946..923710d 100644
--- a/configs/mco_256.json
+++ b/configs/mco_256.json
@@ -40,4 +40,4 @@
             "annotated": true
         }
     }
-}
\ No newline at end of file
+}
diff --git a/configs/mco_512.json b/configs/mco_512.json
index e7e0ced..e503641 100644
--- a/configs/mco_512.json
+++ b/configs/mco_512.json
@@ -40,4 +40,4 @@
             "annotated": true
         }
     }
-}
\ No newline at end of file
+}
diff --git a/configs/tcga-crc_256.json b/configs/tcga-crc_256.json
index 535ed84..acc6edb 100644
--- a/configs/tcga-crc_256.json
+++ b/configs/tcga-crc_256.json
@@ -40,4 +40,4 @@
             "annotated": true
         }
     }
-}
\ No newline at end of file
+}
diff --git a/configs/tcga-crc_512.json b/configs/tcga-crc_512.json
index b67e8a8..ea9c0eb 100644
--- a/configs/tcga-crc_512.json
+++ b/configs/tcga-crc_512.json
@@ -40,4 +40,4 @@
             "annotated": true
         }
     }
-}
\ No newline at end of file
+}
diff --git a/count_tiles.py b/count_tiles.py
index de3e39b..a7ca4e3 100644
--- a/count_tiles.py
+++ b/count_tiles.py
@@ -1,7 +1,7 @@
+import json
 import os
-from pathlib import Path
 from argparse import ArgumentParser
-import json
+from pathlib import Path
 
 script_dir = os.path.dirname(os.path.realpath(__file__))
 
@@ -21,11 +21,11 @@ def main(config_path):
         n_other = len(os.listdir(os.path.join(slide_path, "non_tumor")))
         n_total = n_tumor + n_other
         frac = n_tumor / n_total * 100
-        print("{}: {}/{}, {:.4f}% tumor tiles of total tiles, name: {}".format(i, n_tumor, n_total, frac, slide))
+        print(f"{i}: {n_tumor}/{n_total}, {frac:.4f}% tumor tiles of total tiles, name: {slide}")
         total_tumor += n_tumor
         total_tiles += n_total
     frac = total_tumor / total_tiles * 100
-    print("Total: {}/{}, {:.4f}% tumor tiles of total tiles".format(total_tumor, total_tiles, frac))
+    print(f"Total: {total_tumor}/{total_tiles}, {frac:.4f}% tumor tiles of total tiles")
 
 
 if __name__ == "__main__":
diff --git a/environment.yml b/environment.yaml
similarity index 90%
rename from environment.yml
rename to environment.yaml
index d3a3b78..392367d 100644
--- a/environment.yml
+++ b/environment.yaml
@@ -7,11 +7,11 @@ dependencies:
   - python=3.9.*
   - openslide=3.4.1  # C-Library, latest release from 2015
   - openslide-python=1.2.0  # Python binding, release from 2022
+  - pre-commit=3.*
   - pip:
-    - black==23.3.* 
     - opencv-python==4.5.5.62
     - Pillow==9.5.0
     - pandas==1.5.2  # Before switching from Numpy to Apache Arrow backend
     - numpy==1.24.4
     - matplotlib==3.7.*
-    - tqdm==4.66.1
\ No newline at end of file
+    - tqdm==4.66.1
diff --git a/tile_generator.py b/tile_generator.py
index a5b9e09..4275bf7 100644
--- a/tile_generator.py
+++ b/tile_generator.py
@@ -1,24 +1,20 @@
 # System
+import json
+import multiprocessing
 import os
-from pathlib import Path
 import shutil
-import zipfile
-
-# Advanced
-import multiprocessing
 import xml.etree.ElementTree as ET
-import json
-from tqdm import tqdm
+import zipfile
+from pathlib import Path
 
-# Image Processing
-import openslide
-from PIL import Image
+# Libraries
 import cv2
-
-# Data
-import pandas as pd
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+import openslide
+import pandas as pd
+from PIL import Image
+from tqdm import tqdm
 
 # Custom
 import tissue_detection
@@ -474,7 +470,7 @@ def extract_patches(
             if self.annotated_only and not tile_dict[tile_key]["annotated"]:
                 pass
             else:
-                # ToDo: rows and cols arent calculated correctly, instead a quick fix by using breaks was applied
+                # TODO: rows and cols aren't calculated correctly; a quick fix using breaks was applied instead
 
                 tile_x = tile_dict[tile_key]["x"] * scaling_factor
                 tile_y = tile_dict[tile_key]["y"] * scaling_factor
@@ -666,7 +662,7 @@ def save_thumbnail(self, mask, slide_name, level, output_format="png", save_mask
             plt.imsave(file_name, img, format=output_format)
 
     def init_generic_tiff(self):
-        unit_dict = {"milimeter": 1000, "centimeter": 10000, "meter": 1000000}
+        unit_dict = {"millimeter": 1000, "centimeter": 10000, "meter": 1000000}
         scanner = "generic-tiff"
 
         assert self.slide.properties["tiff.ResolutionUnit"] in unit_dict.keys(), (
diff --git a/tissue_detection.py b/tissue_detection.py
index fa55403..f7c11f0 100644
--- a/tissue_detection.py
+++ b/tissue_detection.py
@@ -1,11 +1,11 @@
-import numpy as np
-import matplotlib.pyplot as plt
 import copy
+
 import cv2
+import matplotlib.pyplot as plt
+import numpy as np
 
 
 def tissue_detection(img, remove_top_border: bool = False):
-
     kernel_size = 3
 
     # remove alpha channel