merge with reorganized repo #526

Workflow file for this run
# This workflow is used for testing whether the DEG list produced by the new code is identical to the expected result based on example data.
name: Tests
on:
  push:
    branches:
      - '*'
  schedule:
    - cron: '0 0 1 * *'
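    # '0 0 1 * *' = 00:00 on the first day of every month (GitHub Actions cron schedules run in UTC).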
env:
  CACHE_NUMBER: 0
permissions:
  contents: read
jobs:
  build:
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    strategy:
      matrix:
        include:
          - os: ubuntu-20.04
            label: linux-64
            prefix: /usr/share/miniconda3/envs/base
        r-version: ['4.3.1']
        python-version: ['3.7']
        seq_type: ['temposeq'] # add later... , 'rnaseq']
    runs-on: ${{ matrix.os }}
    defaults:
      run:
        shell: bash -l {0}
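    # bash -l gives a login shell so the conda/mamba initialization written by setup-miniconda is sourced in every run step (this is the documented requirement of conda-incubator/setup-miniconda).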
    steps:
      - uses: actions/checkout@v3
      - name: Install linux dependencies
        run: |
          sudo apt-get update
          sudo apt-get install libcairo2-dev
          sudo apt-get install libxt-dev
      - name: Set up python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Setup Mambaforge
        uses: conda-incubator/setup-miniconda@v2
        with:
          miniforge-variant: Mambaforge
          miniforge-version: latest
          environment-file: workflow/envs/base.yml
          activate-environment: base
          use-mamba: true
          python-version: ${{ matrix.python-version }}
          mamba-version: "*"
          auto-activate-base: true
          auto-update-conda: true
          use-only-tar-bz2: true # This needs to be set for caching to work properly, according to others
      # - name: Set cache date
      #   run: echo "DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV
      # - uses: actions/cache@v3
      #   with:
      #     path: ${{ matrix.prefix }}
      #     key: ${{ matrix.label }}-conda-${{ hashFiles('workflow/envs/base.yml') }}-${{ env.DATE }}-${{ env.CACHE_NUMBER }}
      #   id: cache
      - name: Update environment
        run: mamba env update -n base -f workflow/envs/base.yml
        if: steps.cache.outputs.cache-hit != 'true'
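        # Note: while the cache step above stays commented out, steps.cache.outputs.cache-hit resolves to an empty string, so this condition is always true and the environment update always runs.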
      - name: Check conda install
        run: |
          conda info
          conda list
        shell: bash -l {0}
      - name: Import data
        run: |
          ls -alht # What is in the "working directory"?
          git clone https://github.com/EHSRB-BSRSE-Bioinformatics/test-data
          rm -r inputs # Remove existing directory before replacing w/ test data
          mv test-data/${{ matrix.seq_type }}/* ./
          wget https://github.com/EHSRB-BSRSE-Bioinformatics/unify_temposeq_manifests/raw/main/output_manifests/Human_S1500_1.2_standardized.csv
      - name: Build snakemake environment
        run: |
          snakemake --cores 8 --use-conda --conda-create-envs-only
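          # --conda-create-envs-only builds the per-rule conda environments and exits without executing any rules, keeping environment creation separate from the actual test run.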
      - name: Install extra dependencies
        run: |
          conda run -p $(grep -rl "R-ODAF_reports" .snakemake/conda/*.yaml | \
            sed 's/\.yaml//') Rscript install.R
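          # The command above locates the snakemake-created conda env that provides R-ODAF_reports: snakemake stores each env in .snakemake/conda/<hash> alongside a <hash>.yaml copy of its spec, so stripping the .yaml suffix yields the env prefix passed to conda run -p.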
      - name: Run workflow
        run: |
          snakemake --cores 8 --use-conda
      - name: Generate md5sums
        run: |
          md5sum output/processed/count_table.tsv > checksum.count_table
          md5sum inputs/metadata/metadata.QC_applied.txt > checksum.metadata.QC_applied
          md5sum output/QC/MultiQC_Report_data.zip > checksum.multiQC
          md5sum output/QC/details/QC_per_sample.txt > checksum.QC_per_sample
          md5sum output/analysis/analysis_default_????????-????/DEG_lists/BaP/*.txt > checksums.BaP_DEGs # wildcards for timestamped folder
          md5sum output/analysis/analysis_default_????????-????/DEG_lists/CISP/*.txt > checksums.CISP_DEGs # wildcards for timestamped folder
          # md5sum output/analysis/analysis_default_????????-????/pathway_analysis/BaP/*WikiPathways* > checksums.BaP_pathways
          # md5sum output/analysis/analysis_default_????????-????/pathway_analysis/CISP/* > checksums.CISP_pathways
          md5sum output/analysis/analysis_default_????????-????/BMD_and_biomarker_files/*.txt > checksums.BMD_files
          md5sum output/analysis/analysis_default_????????-????/BMD_and_biomarker_files/*/*.txt >> checksums.BMD_files
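          # Each ? in analysis_default_????????-???? matches one character of the run's date-time stamp in the analysis folder name (presumably YYYYMMDD-HHMM), so the exact timestamp does not need to be known in advance.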
      - name: Compare pre-processing and QC files to truth set
        run: |
          cmp truth_checksums/checksum.count_table checksum.count_table
          cmp truth_checksums/checksum.metadata.QC_applied checksum.metadata.QC_applied
          # cmp truth_checksums/checksum.multiQC checksum.multiQC
          cmp truth_checksums/checksum.QC_per_sample checksum.QC_per_sample
      - name: Print out checksums for DEG lists
        run: |
          echo "BaP DEG checksums from this run:"
          cat checksums.BaP_DEGs # Can be informative if there are errors
          echo "BaP DEG checksums from 'truth' set:"
          cat truth_checksums/checksums.BaP_DEGs # Can be informative if there are errors
          # There are rounding differences between different environments, e.g.:
          # cat analysis/DEG_lists/BaP/*_significant.txt
          echo "CISP DEG checksums from this run:"
          cat checksums.CISP_DEGs
          echo "CISP DEG checksums from 'truth' set:"
          cat truth_checksums/checksums.CISP_DEGs # Can be informative if there are errors
          # echo "BaP pathway checksums from this run:"
          # cat checksums.BaP_pathways
          # echo "BaP pathway checksums from 'truth' set:"
          # cat truth_checksums/checksums.BaP_pathways
      - name: Compare DEG lists to truth set
        run: |
          # Only check the first column (the checksum), because of timestamps in the filenames
          # cmp <(cat truth_checksums/checksums.BaP_DEGs | awk '{print $1}' | sort) \
          #   <(cat checksums.BaP_DEGs | awk '{print $1}' | sort)
          # cmp <(cat truth_checksums/checksums.CISP_DEGs | awk '{print $1}' | sort) \
          #   <(cat checksums.CISP_DEGs | awk '{print $1}' | sort)
          # Instead of cmp, this code will check if ALL the checksums from the test have a corresponding match in the larger set of possible "truth" checksums.
          # This is a hack, but it should work to deal with rounding issues.
          diff -q <(sort -u checksums.BaP_DEGs | awk '{print $1}') \
            <(grep -Fxf \
              <(cat checksums.BaP_DEGs | awk '{print $1}') \
              <(cat truth_checksums/checksums.BaP_DEGs | awk '{print $1}' | sort -u))
          diff -q <(sort -u checksums.CISP_DEGs | awk '{print $1}') \
            <(grep -Fxf \
              <(cat checksums.CISP_DEGs | awk '{print $1}') \
              <(cat truth_checksums/checksums.CISP_DEGs | awk '{print $1}' | sort -u))
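          # In the checks above, grep -Fx -f <this run's checksums> <truth checksums> keeps only the truth checksums that exactly match a checksum from this run; diff -q then reports a difference (and exits non-zero) if any checksum produced by this run is absent from the truth set.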
      # - name: Compare pathway analysis to truth set
      #   run: |
      #     cmp truth_checksums/checksums.BaP_pathways checksums.BaP_pathways
      - name: Compare BMD input files to truth set
        run: |
          cat checksums.BMD_files # Can be informative if there are errors
          cat truth_checksums/checksums.BMD_files # Can be informative if there are errors
          cmp <(sort -u checksums.BMD_files | awk '{print $1}') <(sort -u truth_checksums/checksums.BMD_files | awk '{print $1}')