From 20db7555feb9f9db7d1c6ac3bd8a4b7684b8e849 Mon Sep 17 00:00:00 2001 From: Alexander Kurz Date: Tue, 5 Dec 2023 16:37:17 +0100 Subject: [PATCH] Added pre-commit and adjusted files --- .gitignore | 2 +- .pre-commit-config.yaml | 148 ++++++++++++++++++++++++++++ README.md | 69 ++++++------- configs/mco_256.json | 2 +- configs/mco_512.json | 2 +- configs/tcga-crc_256.json | 2 +- configs/tcga-crc_512.json | 2 +- count_tiles.py | 8 +- environment.yml => environment.yaml | 4 +- tile_generator.py | 28 +++--- tissue_detection.py | 6 +- 11 files changed, 209 insertions(+), 64 deletions(-) create mode 100644 .pre-commit-config.yaml rename environment.yml => environment.yaml (90%) diff --git a/.gitignore b/.gitignore index 2e149ae..e1b13d1 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,4 @@ __pycache__ .idea/* # VSCode -.vscode \ No newline at end of file +.vscode diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..cdcfa3c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,148 @@ +default_language_version: + python: python3 + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + # list of supported hooks: https://pre-commit.com/hooks.html + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-docstring-first + - id: check-yaml + - id: debug-statements + - id: detect-private-key + - id: check-executables-have-shebangs + - id: check-toml + - id: check-case-conflict + - id: check-added-large-files + + # python code formatting + - repo: https://github.com/psf/black + rev: 23.1.0 + hooks: + - id: black + args: [--line-length, "120"] + + # python import sorting + - repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort + args: ["--profile", "black", "--filter-files"] + + # python upgrading syntax to newer version + - repo: https://github.com/asottile/pyupgrade + rev: v3.3.1 + hooks: + - id: pyupgrade + args: [--py38-plus] + + # python docstring formatting + - repo: https://github.com/myint/docformatter + rev: v1.7.4 + hooks: + - id: docformatter + args: + [ + --in-place, + --wrap-summaries=99, + --wrap-descriptions=99, + --style=sphinx, + --black, + ] + + # python docstring coverage checking + # - repo: https://github.com/econchick/interrogate + # rev: 1.5.0 # or master if you're bold + # hooks: + # - id: interrogate + # args: + # [ + # --verbose, + # --fail-under=80, + # --ignore-init-module, + # --ignore-init-method, + # --ignore-module, + # --ignore-nested-functions, + # -vv, + # ] + + # python check (PEP8), programming errors and code complexity + # - repo: https://github.com/PyCQA/flake8 + # rev: 6.0.0 + # hooks: + # - id: flake8 + # args: + # [ + # "--extend-ignore", + # "E203,E402,E501,F401,F841,RST2,RST301", + # "--exclude", + # "logs/*,data/*", + # ] + # additional_dependencies: [flake8-rst-docstrings==0.3.0] + + # python security linter + # - repo: https://github.com/PyCQA/bandit + # rev: "1.7.5" + # hooks: + # - id: bandit + # args: ["-s", "B101"] + + # yaml formatting + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.0-alpha.6 + hooks: + - id: prettier + types: [yaml] + exclude: "environment.yaml" + + # shell scripts linter + - repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.9.0.2 + hooks: + - id: shellcheck + + # md formatting + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.17 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-gfm + - mdformat-frontmatter + - mdformat-myst # Required for inline math, see https://github.com/executablebooks/mdformat + - mdformat-tables + # - mdformat-black + # - mdformat-toc + + # word spelling linter + - repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + args: + - --skip=logs/**,data/**,*.ipynb + # - --ignore-words-list=abc,def + + # jupyter notebook cell output clearing + - repo: https://github.com/kynan/nbstripout + rev: 0.6.1 + hooks: + - id: nbstripout + + # jupyter notebook linting + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.6.3 + hooks: + - id: nbqa-black + args: ["--line-length=99"] + - id: nbqa-isort + args: ["--profile=black"] + - id: nbqa-flake8 + args: + [ + "--extend-ignore=E203,E402,E501,F401,F841", + "--exclude=logs/*,data/*", + ] diff --git a/README.md b/README.md index 836bf0e..96b8572 100644 --- a/README.md +++ b/README.md @@ -14,10 +14,11 @@ conda activate wsi-pre2 ## Run tiling with provided config files -The main script to run is `tile_generator.py`. We provide configs in the `configs/` folder which generate tables of patch locations with the corresponding pixel sizes. The tables are then stored as `.csv` files for each slide in the configured `output_path`. +The main script to run is `tile_generator.py`. We provide configs in the `configs/` folder which generate tables of patch locations with the corresponding pixel sizes. The tables are then stored as `.csv` files for each slide in the configured `output_path`. By default multiprocessing is enabled, such that multiple slides can be processed simultaneously. As example the tiling of TCGA slides with `patch_size=256` can be started as follows: + ```bash python tile_generator.py --config configs/tcga-crc_256.json ``` @@ -26,36 +27,36 @@ python tile_generator.py --config configs/tcga-crc_256.json The table shows descriptions for the most important config parameters: -| Dictionary Entry | Description | -| ----------- | ----------- | -| check_resolution | Perform a resolution check of all slides before extracting patches | -| use_tissue_detection | Toggle the activation of tissue detection | -| remove_top_border | Useful for Camelyon slides. Default is false | -| save_patches | In old pipelines we used to store patches. In this project the default is false | -| zip_patches | Experimental to try if zipped patch image directories increase transfer speeds. Default is false. | -| tissue_coverage | Threshold [0,1] for how much tissue coverage is necessary, default is 0.8| -| processing_level | Level of downscaling by openslide - Lowering the level will increase precision but more time is needed, default is 3 | -| blocked_threads |Number of threads that wont be used by the program| -| patches_per_tile | Number of patches used for lower resolution operations like tissue detection | -| overlap | Value [0,1] to set the overlap between neighbouring unannotated patches | -| annotation_overlap | Value [0,1] to set the overlap between neighbouring annotated patches | -| patch_size | Output pixel size of the quadratic patches | -| calibration | | -| use_non_pixel_lengths | Activate calibration and use micrometers instead of pixels | -| patch_size_microns | Specify the patch size in micrometers. At 0.25 $\mu\text{m}$ / pixel, 64 $\mu\text{m}$ equal 256 pixels | -| resize | Whether to resize the patches in micrometers to the given patch_size | -| dataset | Provide name for the dataset | -| slides_dir | Directory where the different slides and subdirs are located | -| slideinfo_file | Provide a .csv file with filenames and labels | -| annotation_dir | Directory where the annotations are located | -| annotation_file_format | File format of the input annotations ("xml","geojson")| -| output_path | Output directory to where the resulting files will be stored | -| skip_unlabeled_slides | Boolean to skip slides without an annotation file | -| save_annotated_only | Boolean to only save annotated patches | -| output_format | Image output format. Either "jpeg" or "png" | -| metadata_format | Format in which slide metadata is stored. Default is "csv" | -| write_slideinfo | Write information about the processed slide | -| show_mode | Boolean to enable plotting of some intermediate results/visualizations | -| label_dict | Structure to set up the operator and the threshold for checking the coverage of a certain class| -| type | Operator type [ "==", ">=", "<="]| -| threshold | Coverage threshold for the individual class | \ No newline at end of file +| Dictionary Entry | Description | +| ---------------------- | -------------------------------------------------------------------------------------------------------------------- | +| check_resolution | Perform a resolution check of all slides before extracting patches | +| use_tissue_detection | Toggle the activation of tissue detection | +| remove_top_border | Useful for Camelyon slides. Default is false | +| save_patches | In old pipelines we used to store patches. In this project the default is false | +| zip_patches | Experimental to try if zipped patch image directories increase transfer speeds. Default is false. | +| tissue_coverage | Threshold \[0,1\] for how much tissue coverage is necessary, default is 0.8 | +| processing_level | Level of downscaling by openslide - Lowering the level will increase precision but more time is needed, default is 3 | +| blocked_threads | Number of threads that won't be used by the program | +| patches_per_tile | Number of patches used for lower resolution operations like tissue detection | +| overlap | Value \[0,1\] to set the overlap between neighbouring unannotated patches | +| annotation_overlap | Value \[0,1\] to set the overlap between neighbouring annotated patches | +| patch_size | Output pixel size of the quadratic patches | +| calibration | | +| use_non_pixel_lengths | Activate calibration and use micrometers instead of pixels | +| patch_size_microns | Specify the patch size in micrometers. At 0.25 $\mu\text{m}$ / pixel, 64 $\mu\text{m}$ equal 256 pixels | +| resize | Whether to resize the patches in micrometers to the given patch_size | +| dataset | Provide name for the dataset | +| slides_dir | Directory where the different slides and subdirs are located | +| slideinfo_file | Provide a .csv file with filenames and labels | +| annotation_dir | Directory where the annotations are located | +| annotation_file_format | File format of the input annotations ("xml","geojson") | +| output_path | Output directory to where the resulting files will be stored | +| skip_unlabeled_slides | Boolean to skip slides without an annotation file | +| save_annotated_only | Boolean to only save annotated patches | +| output_format | Image output format. Either "jpeg" or "png" | +| metadata_format | Format in which slide metadata is stored. Default is "csv" | +| write_slideinfo | Write information about the processed slide | +| show_mode | Boolean to enable plotting of some intermediate results/visualizations | +| label_dict | Structure to set up the operator and the threshold for checking the coverage of a certain class | +| type | Operator type \["==", ">=", "\<="\] | +| threshold | Coverage threshold for the individual class | diff --git a/configs/mco_256.json b/configs/mco_256.json index 07c6946..923710d 100644 --- a/configs/mco_256.json +++ b/configs/mco_256.json @@ -40,4 +40,4 @@ "annotated": true } } -} \ No newline at end of file +} diff --git a/configs/mco_512.json b/configs/mco_512.json index e7e0ced..e503641 100644 --- a/configs/mco_512.json +++ b/configs/mco_512.json @@ -40,4 +40,4 @@ "annotated": true } } -} \ No newline at end of file +} diff --git a/configs/tcga-crc_256.json b/configs/tcga-crc_256.json index 535ed84..acc6edb 100644 --- a/configs/tcga-crc_256.json +++ b/configs/tcga-crc_256.json @@ -40,4 +40,4 @@ "annotated": true } } -} \ No newline at end of file +} diff --git a/configs/tcga-crc_512.json b/configs/tcga-crc_512.json index b67e8a8..ea9c0eb 100644 --- a/configs/tcga-crc_512.json +++ b/configs/tcga-crc_512.json @@ -40,4 +40,4 @@ "annotated": true } } -} \ No newline at end of file +} diff --git a/count_tiles.py b/count_tiles.py index de3e39b..a7ca4e3 100644 --- a/count_tiles.py +++ b/count_tiles.py @@ -1,7 +1,7 @@ +import json import os -from pathlib import Path from argparse import ArgumentParser -import json +from pathlib import Path script_dir = os.path.dirname(os.path.realpath(__file__)) @@ -21,11 +21,11 @@ def main(config_path): n_other = len(os.listdir(os.path.join(slide_path, "non_tumor"))) n_total = n_tumor + n_other frac = n_tumor / n_total * 100 - print("{}: {}/{}, {:.4f}% tumor tiles of total tiles, name: {}".format(i, n_tumor, n_total, frac, slide)) + print(f"{i}: {n_tumor}/{n_total}, {frac:.4f}% tumor tiles of total tiles, name: {slide}") total_tumor += n_tumor total_tiles += n_total frac = total_tumor / total_tiles * 100 - print("Total: {}/{}, {:.4f}% tumor tiles of total tiles".format(total_tumor, total_tiles, frac)) + print(f"Total: {total_tumor}/{total_tiles}, {frac:.4f}% tumor tiles of total tiles") if __name__ == "__main__": diff --git a/environment.yml b/environment.yaml similarity index 90% rename from environment.yml rename to environment.yaml index d3a3b78..392367d 100644 --- a/environment.yml +++ b/environment.yaml @@ -7,11 +7,11 @@ dependencies: - python=3.9.* - openslide=3.4.1 # C-Library, latest release from 2015 - openslide-python=1.2.0 # Python binding, release from 2022 + - pre-commit=3.* - pip: - - black==23.3.* - opencv-python==4.5.5.62 - Pillow==9.5.0 - pandas==1.5.2 # Before switching from Numpy to Apache Arrow backend - numpy==1.24.4 - matplotlib==3.7.* - - tqdm==4.66.1 \ No newline at end of file + - tqdm==4.66.1 diff --git a/tile_generator.py b/tile_generator.py index a5b9e09..4275bf7 100644 --- a/tile_generator.py +++ b/tile_generator.py @@ -1,24 +1,20 @@ # System +import json +import multiprocessing import os -from pathlib import Path import shutil -import zipfile - -# Advanced -import multiprocessing import xml.etree.ElementTree as ET -import json -from tqdm import tqdm +import zipfile +from pathlib import Path -# Image Processing -import openslide -from PIL import Image +# Libraries import cv2 - -# Data -import pandas as pd -import numpy as np import matplotlib.pyplot as plt +import numpy as np +import openslide +import pandas as pd +from PIL import Image +from tqdm import tqdm # Custom import tissue_detection @@ -474,7 +470,7 @@ def extract_patches( if self.annotated_only and not tile_dict[tile_key]["annotated"]: pass else: - # ToDo: rows and cols arent calculated correctly, instead a quick fix by using breaks was applied + # TODO: rows and cols aren't calculated correctly, instead a quick fix by using breaks was applied tile_x = tile_dict[tile_key]["x"] * scaling_factor tile_y = tile_dict[tile_key]["y"] * scaling_factor @@ -666,7 +662,7 @@ def save_thumbnail(self, mask, slide_name, level, output_format="png", save_mask plt.imsave(file_name, img, format=output_format) def init_generic_tiff(self): - unit_dict = {"milimeter": 1000, "centimeter": 10000, "meter": 1000000} + unit_dict = {"millimeter": 1000, "centimeter": 10000, "meter": 1000000} scanner = "generic-tiff" assert self.slide.properties["tiff.ResolutionUnit"] in unit_dict.keys(), ( diff --git a/tissue_detection.py b/tissue_detection.py index fa55403..f7c11f0 100644 --- a/tissue_detection.py +++ b/tissue_detection.py @@ -1,11 +1,11 @@ -import numpy as np -import matplotlib.pyplot as plt import copy + import cv2 +import matplotlib.pyplot as plt +import numpy as np def tissue_detection(img, remove_top_border: bool = False): - kernel_size = 3 # remove alpha channel