Implemented Similarity Measures (#15)

* added self-similarity * sequence similarity implementations * versioned the environments * remove not needed envs * added pathogen-derived antigens list * containerize * get this to work * changes in versioning * refactor fusion input * exitron input missing * hla typing requires RNA, DNA or custom (but not both) - list required * added correct snakemake way to include external parameters from config * added exitron input routine - missing from last test * fixed wrong input routine * outsourced variant effects in prioritization * separated variants input and add quotes to ensure empty list works properly * BLOSUM62-2 matrix for sequence similarity * added release version and missing indentation * added release option * adjusted annotation path * added reference to test config
ylab-hi · Mar 12, 2024 · 9b6dcb8 · 9b6dcb8
1 parent 2de4992
commit 9b6dcb8
Show file tree

Hide file tree

Showing 40 changed files with 9,562 additions and 916 deletions.
diff --git a/.github/workflows/containerize.yml b/.github/workflows/containerize.yml
@@ -1,9 +1,6 @@
 name: Containerize
 run-name: ${{ github.actor }} is creating a Docker container for ScanNeo2
-on: 
-  push:
-    branches:
-      - master
+on: [push]
 jobs:
   containerize:
     runs-on: ubuntu-latest

diff --git a/.tests/integration/config_basic/config.yaml b/.tests/integration/config_basic/config.yaml
@@ -1,4 +1,8 @@
-### General
+# reference
+reference:
+  release: 111
+
+### General Settings
 threads: 30
 mapq: 30  # overall required mapping quality
 basequal: 20  # overall required base quality 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.3.0] - 2024-03-07
+
+### Features
+
+- Added sequence similarity filter for MHC-I
+    - self-similarity (using kernel similarity)
+    - pathogen similarity (BLAST against pathogen-derived epitopes from IEDB)
+    - proteome similarity (BLAST against human proteome)
+- Prioritization of neoantigens is now done separately for each variant type (speeds up the process)
+- NMD information (e.g., escape rule, ...) is now also calculated for all variants
+
 ## [0.2.2] - 2024-03-01
 
 ### Fix 

diff --git a/config/config.yaml b/config/config.yaml
@@ -1,4 +1,8 @@
-### General
+### Reference
+reference:
+  release: 111
+
+# General settings
 threads: 30
 mapq: 30  # overall required mapping quality
 basequal: 20  # overall required base quality 
@@ -81,8 +85,8 @@ quantification:
 hlatyping:
   class: BOTH # I, II or BOTH
   # specific path for class II hlatyping (only required when class: II, or BOTH)
-  MHC-I_mode: BOTH # DNA, RNA, or BOTH (if empty alleles have to be specified in custom)
-  MHC-II_mode: BOTH # DNA, RNA, or BOTH (if empty alleles have to be specified in custom)
+  MHC-I_mode: DNA, RNA # DNA, RNA, or custom (if empty, alleles have to be specified in custom)
+  MHC-II_mode: DNA, RNA # DNA, RNA, or custom (if empty, alleles have to be specified in custom)
 
   # specific path for class II hlatyping (only required when class: II, or BOTH)
   freqdata: ./hlahd_files/freq_data/ 

diff --git a/workflow/envs/basic.yml b/workflow/envs/basic.yml
@@ -6,7 +6,7 @@ dependencies:
   - bwa=0.7.17
   - samtools=1.16.1
   - configargparse=1.7
-  - pyfaidx
+  - pyfaidx=0.8.1.1
   - biopython=1.78
-  - gffutils
-  - pysam
+  - gffutils=0.12
+  - pysam=0.22.0
diff --git a/workflow/envs/gatk.yml b/workflow/envs/gatk.yml
@@ -1,5 +1,4 @@
 channels:
   - bioconda
 dependencies:
-  - gatk4
-
+  - gatk4=4.5.0.0
diff --git a/workflow/envs/hlahd.yml b/workflow/envs/hlahd.yml
@@ -3,4 +3,4 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - bowtie
+  - bowtie=1.3.1
diff --git a/workflow/envs/optitype.yml b/workflow/envs/optitype.yml
@@ -3,6 +3,6 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - optitype =1.3.5
-  - pysam
-  - samtools
+  - optitype=1.3.5
+  - pysam=0.22.0
+  - samtools=1.19.2
diff --git a/workflow/envs/prioritization.yml b/workflow/envs/prioritization.yml
@@ -3,11 +3,14 @@ channels:
  - conda-forge
  - anaconda
 dependencies:
- - python>=3.6
- - vcfpy
- - pyfaidx
- - configargparse
- - pandas
- - pip
+ - python=3.7.12
+ - vcfpy=0.13.6
+ - pyfaidx=0.7.0
+ - configargparse=1.7
+ - pandas=1.3.5
+ - blast=2.15.0
+ - zstd=1.5.5
+ - pip=24.0
  - pip:
-   - gffutils
+   - gffutils==0.12
+   - blosum==2.0.2
diff --git a/workflow/envs/readgroups.yml b/workflow/envs/readgroups.yml
diff --git a/workflow/envs/samtools.yml b/workflow/envs/samtools.yml
@@ -1,6 +1,6 @@
 channels:
   - bioconda
 dependencies:
- - samtools
- - bcftools
+ - samtools=1.9
+ - bcftools=1.9
 
diff --git a/workflow/envs/scanexitron.yml b/workflow/envs/scanexitron.yml
diff --git a/workflow/envs/slippage_removal.yml b/workflow/envs/slippage_removal.yml
diff --git a/workflow/envs/spladder.yml b/workflow/envs/spladder.yml
@@ -2,8 +2,7 @@ channels:
  - anaconda
  - bioconda
 dependencies:
- - python=3.6
- - pysam
+ - python=3.8
  - pip
  - pip:
    - spladder==3.0.4
diff --git a/workflow/envs/star.yml b/workflow/envs/star.yml
diff --git a/workflow/envs/transindel.yml b/workflow/envs/transindel.yml
@@ -4,10 +4,10 @@ channels:
   - defaults
   - r
 dependencies:
-  - samtools
-  - pysam
-  - htseq
-  - pyfaidx
-  - pip
+  - samtools=1.9
+  - pysam=0.15.3
+  - htseq=2.0.3
+  - pyfaidx=0.7.0
+  - pip=24.0
   - pip:
     - vcfpy==0.13.6
diff --git a/workflow/envs/yara.yml b/workflow/envs/yara.yml
@@ -1,5 +1,5 @@
 channels:
   - bioconda
 dependencies:
-  - yara
-  - samtools
+  - yara=1.0.2
+  - samtools=1.9
diff --git a/workflow/rules/altsplicing.smk b/workflow/rules/altsplicing.smk
@@ -9,18 +9,21 @@ rule spladder:
     log:
         "logs/{sample}/spladder/{group}_build.log"
     params:
-      confidence = "--confidence {config[altsplicing][confidence]}",
-      iteration = "--iterations {config[altsplicing][iterations]}",
-      edgelimit = "--ase-edge-limit {config[altsplicing][edgelimit]}",
+      confidence=f"""--confidence {config["altsplicing"]["confidence"]}""",
+      iteration=f"""--iterations {config["altsplicing"]["iterations"]}""",
+      edgelimit=f"""--ase-edge-limit {config["altsplicing"]["edgelimit"]}"""
     shell:
         """
           spladder build -b {input.bam} \
               -a resources/refs/genome.gtf \
               -o {output} --filter-overlap-exons \
               --no-primary-only --quantify-graph \
+              {params.confidence} \
+              {params.iteration} \
+              {params.edgelimit} \
               --qmode all > {log} 2>&1
         """
-
+        
 rule splicing_to_vcf:
   input:
     "results/{sample}/rnaseq/altsplicing/spladder/{group}"

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
@@ -7,25 +7,31 @@ rule download_vep_plugins:
     "logs/vep/download_plugins.log"
   conda:
     "../envs/basic.yml"
+  params:
+    release=f"""{config['reference']['release']}"""
   shell:
     """
       mkdir -p resources/vep/plugins/
-      curl -L -o {output}/NMD.pm https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/110/NMD.pm
-      curl -L -o {output}/Downstream.pm https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/110/Downstream.pm
+      curl -L -o {output}/NMD.pm https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/{params.release}/NMD.pm
+      curl -L -o {output}/Downstream.pm https://raw.githubusercontent.com/Ensembl/VEP_plugins/release/{params.release}/Downstream.pm
       curl -L -o {output}/Wildtype.pm https://raw.githubusercontent.com/griffithlab/pVAC-Seq/master/pvacseq/VEP_plugins/Wildtype.pm
     """
 
 rule download_vep_cache:
   output:
     directory("resources/vep/cache")
-  conda:
-    "../envs/basic.yml"
+  message:
+    "Downloading VEP cache"
   log:
     "logs/vep/cache.log"
+  conda:
+    "../envs/basic.yml"
+  params:
+    release=f"""{config['reference']['release']}"""
   shell:
     """
       mkdir -p {output}
-      curl -L https://g-a8b222.dd271.03c0.data.globus.org/ensemblorg/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_GRCh38.tar.gz \
+      curl -L https://g-a8b222.dd271.03c0.data.globus.org/ensemblorg/pub/release-{params.release}/variation/indexed_vep_cache/homo_sapiens_vep_{params.release}_GRCh38.tar.gz \
       | tar -xz -C resources/vep/cache
     """
 #      curl -L https://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_GRCh38.tar.gz \

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -684,49 +684,88 @@ def get_altsplicing(wildcards):
 
 
 ########### NEOANTIGEN PRIORITIZATION ##########
-def get_variants(wildcards):
-    variants = []
-    # indels
-    if config["indel"]["activate"]:
-      if config["indel"]["type"] in ["long", "all"]:
-        variants += expand("results/{sample}/annotation/long.indels.vcf",
-                           sample=config["data"]["name"])
-      if config["indel"]["type"] in ["short", "all"]:
-        variants += expand("results/{sample}/annotation/somatic.short.indels.vcf",
-                           sample=config["data"]["name"])
-        variants += expand("results/{sample}/annotation/somatic.snvs.vcf",
-                           sample=config["data"]["name"])
-
-    # alternative splicing
-    if config["altsplicing"]["activate"]:
-      variants += expand("results/{sample}/annotation/altsplicing.vcf",
-                         sample=config["data"]["name"])
+def get_prioritization_snvs(wildcards):
+  snv = []
+  if config["indel"]["activate"]:
+    if config["indel"]["type"] in ["short", "all"]:
+      snv += expand("results/{sample}/annotation/somatic.snvs.vcf",
+                    sample=config["data"]["name"])
+
+    if len(snv) == 0:
+      print(f"Could not detect any SNVs. Please check the config file")
+      sys.exit(1)
+
+  return snv
 
-    # exitron
-    if config["exitronsplicing"]["activate"]:
-      variants += expand("results/{sample}/annotation/exitrons.vcf",
+def get_prioritization_indels(wildcards):
+  indels = []
+  if config["indel"]["activate"]:
+    if config["indel"]["type"] in ["short", "all"]:
+      indels += expand("results/{sample}/annotation/somatic.short.indels.vcf",
                          sample=config["data"]["name"])
+
+    if len(indels) == 0:
+      print(f"Could not detect any indels. Please check the config file")
+      sys.exit(1)
+
+  return indels
+
+def get_prioritization_long_indels(wildcards):
+  long_indels = []
+  if config["indel"]["activate"]:
+    if config["indel"]["type"] in ["long", "all"]:
+      long_indels += expand("results/{sample}/annotation/long.indels.vcf",
+                            sample=config["data"]["name"])
+
+    if len(long_indels) == 0:
+      print(f"Could not detect any long indels. Please check the config file")
+      sys.exit(1)
+
+  return long_indels
 
-    # custom variants
-    if config["data"]["custom"]["variants"] is not None:
-      variants += expand("results/{sample}/annotation/custom.vcf",
+def get_prioritization_exitrons(wildcards):
+  exitrons = []
+  if config["exitronsplicing"]["activate"]:
+    exitrons += expand("results/{sample}/annotation/exitrons.vcf",
                        sample=config["data"]["name"])
+
+    if len(exitrons) == 0:
+      print(f"Could not detect any exitrons. Please check the config file")
+      sys.exit(1)
+
+  return exitrons
 
-    if len(variants) == 0:
+
+def get_prioritization_altsplicing(wildcards):
+  altsplicing = []
+  if config["altsplicing"]["activate"]:
+    altsplicing += expand("results/{sample}/annotation/altsplicing.vcf",
+                          sample=config["data"]["name"])
+
+    if len(altsplicing) == 0:
       print(f"Could not detect any variants. Please check the config file")
       sys.exit(1)
 
-    return variants
+  return altsplicing
+
+def get_prioritization_custom(wildcards):
+  custom = []
+  if config["data"]["custom"]["variants"] is not None:
+    custom += expand("results/{sample}/annotation/custom.vcf",
+                       sample=config["data"]["name"])
+
+  return custom
+
 
-def get_mhcI(wildcards):
+def get_prioritization_mhcI(wildcards):
   alleles = []
   if config['prioritization']['class'] in ['I', 'BOTH']:
     alleles += expand("results/{sample}/hla/mhc-I.tsv",
                       sample=config['data']['name'])
 
   return alleles
 
-def get_mhcII(wildcards):
+def get_prioritization_mhcII(wildcards):
   alleles = []
   if config['prioritization']['class'] in ['II', 'BOTH']:
     alleles += expand("results/{sample}/hla/mhc-II.tsv",