Add basic project structure #1

Merged · 34 commits · Aug 13, 2021
Commits
cc078f1  update README.md (miltondp, Aug 9, 2021)
894364f  add license (miltondp, Aug 9, 2021)
b5244d5  add flake8 configuration (miltondp, Aug 9, 2021)
a8bee78  add basic folder/file structure (miltondp, Aug 9, 2021)
77f63ab  add scripts for basic tasks (miltondp, Aug 9, 2021)
0b5d053  add Dockerfile and related scripts (miltondp, Aug 9, 2021)
f692b6a  update conda environment specification (miltondp, Aug 9, 2021)
5192132  run_nbs_server.sh: path fixes (miltondp, Aug 9, 2021)
ce88d4d  update environment scripts (miltondp, Aug 9, 2021)
ca51e1f  README.md: fix docker run calls (miltondp, Aug 10, 2021)
d61963d  fix module import (miltondp, Aug 10, 2021)
3089bda  environment/README.md: fixes (miltondp, Aug 10, 2021)
332b5da  docker: improve script to create image (miltondp, Aug 10, 2021)
db38aef  environment: fix R env installation scripts (miltondp, Aug 10, 2021)
b9ca770  add openpyxl dependency (miltondp, Aug 10, 2021)
cf2854f  environment: minor changes (miltondp, Aug 10, 2021)
f05e78f  add tests on basic modules (miltondp, Aug 10, 2021)
c1b9897  add lint workflow (miltondp, Aug 10, 2021)
4ba527b  fix code style issues with Black (miltondp, Aug 10, 2021)
965af02  fix flake8 issues (miltondp, Aug 10, 2021)
87830b0  lint.yaml: change commit message (miltondp, Aug 10, 2021)
1bebee6  add pytest.yaml (miltondp, Aug 10, 2021)
0648c09  pytest.yaml: add caching (miltondp, Aug 10, 2021)
c9fa8d2  pytest.yaml: add ms windows (miltondp, Aug 10, 2021)
b76fe1f  add tests on basic modules (miltondp, Aug 11, 2021)
6c6d689  update environment/README.md (miltondp, Aug 11, 2021)
73dd77a  fix tests on windows (miltondp, Aug 11, 2021)
88fb5b4  Update README.md (miltondp, Aug 11, 2021)
df30f47  add code coverage (miltondp, Aug 11, 2021)
1c57a06  improve code coverage in conf.py (miltondp, Aug 11, 2021)
557198b  improve documentation and add minor changes (miltondp, Aug 11, 2021)
cde6211  fix Dockerfile (miltondp, Aug 11, 2021)
9f665d6  environment/README.md: add notes (miltondp, Aug 13, 2021)
36f45ce  add @jjc2718 suggestions (miltondp, Aug 13, 2021)
36 changes: 36 additions & 0 deletions .github/workflows/lint.yaml
@@ -0,0 +1,36 @@
name: lint
on:
  push:
  pull_request:
    types: [opened, reopened]
jobs:
  run-linters:
    name: Run linters
    runs-on: ubuntu-latest

    steps:
      - name: Check out Git repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: 3.9

      - name: Install Python dependencies
        run: pip install black flake8

      - name: Run linters
        uses: wearerequired/lint-action@v1
        with:
          github_token: ${{ secrets.github_token }}
          # Enable linters
          black: true
          flake8: true
          # Mark the following line true if you want linters to attempt to
          # autocorrect your code
          auto_fix: true
          git_name: "Greene Lab Linter"
          git_email: "[email protected]"
          commit_message: "fix code style issues with ${linter}"

85 changes: 85 additions & 0 deletions .github/workflows/pytest.yaml
@@ -0,0 +1,85 @@
name: tests
on:
  push:
  pull_request:
    types: [opened, reopened]

env:
  # Increase this value to reset cache if environment.yml has not changed.
  PY_CACHE_NUMBER: 2
  PY_ENV: cm_gene_expr

jobs:
  pytest:
    name: Python tests
    runs-on: ${{ matrix.os }}
    strategy:
      max-parallel: 4
      fail-fast: false
      matrix:
        python-version: [3.9]
        os: [ubuntu-latest, macOS-latest, windows-latest]
    steps:
      - name: Checkout git repo
        uses: actions/checkout@v2
        with:
          lfs: false
      - name: Cache conda
        id: cache
        uses: actions/cache@v2
        with:
          path: "${{ env.PY_ENV }}.tar.gz"
          key: ${{ runner.os }}-${{ env.PY_CACHE_NUMBER }}-${{ hashFiles('environment/environment.yml') }}
      - name: Setup Miniconda
        if: steps.cache.outputs.cache-hit != 'true'
        uses: conda-incubator/setup-miniconda@v2
        with:
          miniconda-version: "latest"
          auto-update-conda: true
          activate-environment: ${{ env.PY_ENV }}
          channel-priority: strict
          environment-file: environment/environment.yml
          auto-activate-base: false
      - name: Conda-Pack
        if: steps.cache.outputs.cache-hit != 'true'
        shell: bash -l {0}
        run: |
          conda install --yes -c conda-forge conda-pack coverage
          conda pack -f -n ${{ env.PY_ENV }} -o "${{ env.PY_ENV }}.tar.gz"
      - name: Unpack environment
        shell: bash -l {0}
        run: |
          mkdir -p "${{ env.PY_ENV }}"
          tar -xzf "${{ env.PY_ENV }}.tar.gz" -C "${{ env.PY_ENV }}"
      - name: Setup data and run pytest (Windows systems)
        if: runner.os == 'Windows'
        env:
          PYTHONPATH: libs/
        run: |
          ${{ env.PY_ENV }}/python environment/scripts/setup_data.py --mode testing
          ${{ env.PY_ENV }}/python -m pytest -v -rs tests
      - name: Setup data and run pytest (non-Windows systems)
        if: runner.os != 'Windows'
        shell: bash
        env:
          PYTHONPATH: libs/
        run: |
          source ${{ env.PY_ENV }}/bin/activate
          conda-unpack

          python environment/scripts/setup_data.py --mode testing

          if [ "$RUNNER_OS" == "Linux" ]; then
            coverage run --source=libs/ -m pytest -v -rs tests
            coverage xml -o coverage.xml
          else
            pytest -v -rs tests
          fi
      - name: Codecov upload
        if: runner.os == 'Linux'
        uses: codecov/codecov-action@v2
        with:
          files: ./coverage.xml
          name: codecov-${{ matrix.os }}-python${{ matrix.python-version }}
          fail_ci_if_error: true
          verbose: true
42 changes: 42 additions & 0 deletions Dockerfile
@@ -0,0 +1,42 @@
FROM continuumio/miniconda3

EXPOSE 8893/tcp

ENV CODE_DIR=/opt/code
ENV CM_CONDA_ENV_NAME="clustermatch_gene_expr"
ENV CM_N_JOBS=1
ENV CM_ROOT_DIR=/opt/data
ENV CM_MANUSCRIPT_DIR=/opt/manuscript

VOLUME ${CM_ROOT_DIR}
VOLUME ${CM_MANUSCRIPT_DIR}

# install gnu parallel
RUN DEBIAN_FRONTEND=noninteractive apt-get update \
&& apt-get install -y --no-install-recommends parallel \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# set up the clustermatch conda environment
COPY environment/environment.yml environment/scripts/install_other_packages.sh environment/scripts/install_r_packages.r /tmp/
RUN conda env create --name ${CM_CONDA_ENV_NAME} --file /tmp/environment.yml \
&& conda run -n ${CM_CONDA_ENV_NAME} --no-capture-output /bin/bash /tmp/install_other_packages.sh \
&& conda clean --all --yes

# activate the environment when starting bash
RUN echo "conda activate ${CM_CONDA_ENV_NAME}" >> ~/.bashrc
SHELL ["/bin/bash", "--login", "-c"]

ENV PYTHONPATH=${CODE_DIR}/libs:${PYTHONPATH}

RUN echo "Make sure packages can be loaded"
RUN python -c "import papermill"

COPY . ${CODE_DIR}
WORKDIR ${CODE_DIR}

RUN echo "Make sure modules can be loaded"
RUN python -c "from clustermatch import conf"

ENTRYPOINT ["/opt/code/entrypoint.sh"]
CMD ["scripts/run_nbs_server.sh", "--container-mode"]
42 changes: 42 additions & 0 deletions LICENSE
@@ -0,0 +1,42 @@
BSD-2-Clause Plus Patent License

Copyright (c) 2020, Contributors & the Greene Laboratory at the University of Pennsylvania

Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions
and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
and the following disclaimer in the documentation and/or other materials provided with the
distribution.

Subject to the terms and conditions of this license, each copyright holder and contributor hereby
grants to those receiving rights under this license a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except for failure to satisfy the conditions of this license)
patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer this
software, where such license applies only to those patent claims, already acquired or hereafter
acquired, licensable by such copyright holder or contributor that are necessarily infringed by:

(a) their Contribution(s) (the licensed copyrights of copyright holders and non-copyrightable
additions of contributors, in source or binary form) alone; or

(b) combination of their Contribution(s) with the work of authorship to which such Contribution(s)
was added by such copyright holder or contributor, if, at the time the Contribution is added,
such addition causes such combination to be necessarily infringed. The patent license shall not
apply to any other combinations which include the Contribution.

Except as expressly stated above, no rights or licenses from any copyright holder or contributor is
granted under this license, whether expressly, by implication, estoppel or otherwise.

DISCLAIMER

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
104 changes: 103 additions & 1 deletion README.md
@@ -1 +1,103 @@
# clustermatch-gene-expr
# Clustermatch on gene expression data (code)

[![Code tests](https://github.com/greenelab/clustermatch-gene-expr/actions/workflows/pytest.yaml/badge.svg)](https://github.com/greenelab/clustermatch-gene-expr/actions/workflows/pytest.yaml)
[![codecov](https://codecov.io/gh/greenelab/clustermatch-gene-expr/branch/main/graph/badge.svg?token=QNK6O3Y1VF)](https://codecov.io/gh/greenelab/clustermatch-gene-expr)
[![HTML Manuscript](https://img.shields.io/badge/manuscript-HTML-blue.svg)](https://greenelab.github.io/clustermatch-gene-expr-manuscript/)
[![PDF Manuscript](https://img.shields.io/badge/manuscript-PDF-blue.svg)](https://greenelab.github.io/clustermatch-gene-expr-manuscript/manuscript.pdf)


## Overview

![](images/cm_gene_expr_overview.png)

TODO: update description and links to manuscripts

This repository contains the source code to reproduce the analyses of Clustermatch on gene expression data.
If you want to use Clustermatch as a standalone tool to perform your own analyses, please go to the [official repository](https://github.com/sinc-lab/clustermatch) and follow the installation instructions.

For more details, check out our manuscript in COMPLETE or our [Manubot web version](https://greenelab.github.io/clustermatch-gene-expr-manuscript/).


## Setup

To prepare the environment to run the analyses, follow the steps in [environment](environment/).
This will create a conda environment and download the necessary data.
Alternatively, you can use our Docker image (see below).

## Running code

### From command-line

First, activate your conda environment and export your settings as environment variables so non-Python scripts can access them:
```bash
conda activate clustermatch_gene_expr
eval `python libs/conf.py`
```
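
After the `eval` call, the exported settings are available to non-Python scripts as `CM_*` environment variables (the variables below are taken from this repository's environment documentation; the exact set and values depend on your configuration):

```bash
# Quick check that the settings were exported (example variables, not an exhaustive list):
echo "$CM_ROOT_DIR"   # root directory where data is downloaded
echo "$CM_N_JOBS"     # number of cores available for general tasks
```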

The code to preprocess data and generate results is in the `nbs/` folder.
All notebooks are organized into directories, such as `01_preprocessing`, with file names that indicate the order in which they should be run (notebooks that share the same numeric prefix can be run in parallel).
For example, to run all notebooks for the preprocessing step, you can use this command (requires [GNU Parallel](https://www.gnu.org/software/parallel/)):

```bash
cd nbs/
parallel -k --lb --halt 2 -j1 'bash run_nbs.sh {}' ::: 01_preprocessing/*.ipynb
```
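
To run a single notebook instead, you can call `run_nbs.sh` directly (the notebook filename below is only an illustration):

```bash
cd nbs/
bash run_nbs.sh 01_preprocessing/00-example_notebook.ipynb
```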

<!--
Or if you want to run all the analyses at once, you can use:

```bash
shopt -s globstar
parallel -k --lb --halt 2 -j1 'bash run_nbs.sh {}' ::: nbs/{,**/}*.ipynb
```
-->

### From your browser

Alternatively, you can start your JupyterLab server by running:

```bash
bash scripts/run_nbs_server.sh
```

Then, go to `http://localhost:8892`, browse the `nbs` folder, and run the notebooks in the specified order.

## Using Docker

You can also run all the steps below using a Docker image instead of a local installation.

```bash
docker pull miltondp/clustermatch_gene_expr
```
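
If you prefer to build the image locally instead of pulling it, a minimal sketch (run from the repository root, which contains the `Dockerfile`; the repository also provides its own image-creation script):

```bash
docker build -t miltondp/clustermatch_gene_expr .
```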

The image contains only the conda environment and the code in this repository, so after pulling it you also need to download the data:

```bash
docker run --rm \
  -v "/tmp/clustermatch_gene_expr_data:/opt/clustermatch_gene_expr_data" \
  miltondp/clustermatch_gene_expr \
  python environment/scripts/setup_data.py
```

The `-v` parameter allows specifying a local directory (`/tmp/clustermatch_gene_expr_data`) where the data will be downloaded.
If you want to generate the figures and tables for the manuscript, you need to clone the [manuscript repo](https://github.com/greenelab/clustermatch-gene-expr-manuscript) and pass it with `-v [PATH_TO_MANUSCRIPT_REPO]:/opt/clustermatch_gene_expr_manuscript`.
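
For example, a sketch of the same data-download call with the manuscript repository also mounted (the local manuscript path is a placeholder):

```bash
docker run --rm \
  -v "/tmp/clustermatch_gene_expr_data:/opt/clustermatch_gene_expr_data" \
  -v "$HOME/clustermatch-gene-expr-manuscript:/opt/clustermatch_gene_expr_manuscript" \
  miltondp/clustermatch_gene_expr \
  python environment/scripts/setup_data.py
```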

You can run notebooks from the command line, for example:

```bash
docker run --rm \
  -v "/tmp/clustermatch_gene_expr_data:/opt/clustermatch_gene_expr_data" \
  miltondp/clustermatch_gene_expr \
  /bin/bash -c "parallel -k --lb --halt 2 -j1 'bash nbs/run_nbs.sh {}' ::: nbs/01_preprocessing/*.ipynb"
```

or start a Jupyter Notebook server with:

```bash
docker run --rm \
  -p 8888:8892 \
  -v "/tmp/clustermatch_gene_expr_data:/opt/clustermatch_gene_expr_data" \
  miltondp/clustermatch_gene_expr
```

and access the interface by going to `http://localhost:8888`.
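
You can also get an interactive shell inside the container, since the entrypoint activates the conda environment before running the given command (a minimal sketch):

```bash
docker run --rm -it \
  -v "/tmp/clustermatch_gene_expr_data:/opt/clustermatch_gene_expr_data" \
  miltondp/clustermatch_gene_expr \
  /bin/bash
```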
9 changes: 9 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,9 @@
#!/bin/bash --login
# Taken from here with modifications: https://pythonspeed.com/articles/activate-conda-dockerfile/
# The --login ensures the bash configuration is loaded,
# enabling Conda.
set +eu
conda activate clustermatch_gene_expr
set -euo pipefail

exec "$@"
73 changes: 73 additions & 0 deletions environment/README.md
@@ -0,0 +1,73 @@
# Manual conda environment installation and data download

If you want to run the scripts/notebooks, you need to follow these steps to create a conda environment and download the necessary data.

Keep in mind that although unit tests are automatically run on Linux, macOS and MS Windows, the software is manually tested only on Linux/Ubuntu.

1. Install [Miniconda](https://docs.conda.io/en/latest/miniconda.html) or Anaconda.

1. Open a terminal, clone this repository, and `cd` into the repository root folder.

1. Run `cd environment`.

1. (optional) Adjust your environment variables:

```bash
# (optional, will default to subfolder 'cm_gene_expr' under the system's temporary directory)
# Root directory where all data will be downloaded to
export CM_ROOT_DIR=/tmp/cm_gene_expr

# (optional, will default to half the number of cores)
# Adjust the number of cores available for general tasks
export CM_N_JOBS=2

# (optional)
# Export this variable if you downloaded the manuscript sources and want to
# generate the figures for it
export CM_MANUSCRIPT_DIR=/tmp/manuscript
```

1. (optional) Adjust other settings (e.g., the root directory or available computational resources) by modifying the file `../libs/clustermatch/settings.py`.

1. Adjust your `PYTHONPATH` variable to include the `libs` directory:

```bash
export PYTHONPATH=`readlink -f ../libs/`:$PYTHONPATH
```

`readlink` might not work on macOS. In that case, simply replace it with
the absolute path to the `../libs/` folder.
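
   For example, assuming the repository was cloned to `~/projects/clustermatch-gene-expr` (a hypothetical location):

   ```bash
   export PYTHONPATH=$HOME/projects/clustermatch-gene-expr/libs:$PYTHONPATH
   ```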

1. Run `bash scripts/setup_environment.sh`.
This will create a conda environment and download the data needed to run the analyses.
The download is around `XXX` GB, so it will take a while to finish.


# Developer usage

These steps are only for developers.

1. Modify `scripts/environment_base.yml` accordingly (if needed).
1. Run:

```bash
conda env create -n clustermatch_gene_expr -f scripts/environment_base.yml
conda activate clustermatch_gene_expr
bash scripts/install_other_packages.sh
```

<!--
1. (CHECK!) Install JupyterLab extensions (MIGHT NOT BE NECESSARY IN VERSION 3.0+):

```bash
jupyter labextension install @jupyterlab/toc
``` -->

1. Export conda environment:

```bash
conda env export --name clustermatch_gene_expr --file environment.yml
```

1. Modify `environment.yml` and leave only manually installed packages (not their dependencies).