From ae37437079b0d8ad8cb64ae6ad90f760c83c7c46 Mon Sep 17 00:00:00 2001 From: ASLeonard Date: Thu, 16 Nov 2023 17:03:11 +0100 Subject: [PATCH] simplify config and expand readme --- .test/config/config.yaml | 29 ++++++------------ README.md | 14 +++++++-- config/cattle.yaml | 29 ------------------ config/example.yaml | 58 ++++++++++++++++++++++++++++++++++++ config/pangenie_compare.yaml | 21 ------------- config/pangenie_vcf.yaml | 10 ------- 6 files changed, 78 insertions(+), 83 deletions(-) delete mode 100644 config/cattle.yaml create mode 100644 config/example.yaml delete mode 100644 config/pangenie_compare.yaml delete mode 100644 config/pangenie_vcf.yaml diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 0dc0ce5..5e71c1a 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -24,31 +24,20 @@ frac_missing: .15 ## Pangenome genotyping small_variants: DV.vcf.gz -## Pangenie stuff -#panel: '/cluster/work/pausch/alex/eQTL_GWAS/pangenie_panel.vcfwave.vcf' fastq: '' -HiFi_samples: - sample1: hifi_1.fq.gz - sample2: hifi_2.fq.gz - -#pangenie_out: '/cluster/work/pausch/alex/eQTL_GWAS/pangenie_wave/samples.all.pangenie_genotyping.vcf.gz' +samples: + - sample1 + - sample2 -## Phasing and imputation -#beagle4: '' -beagle5: '' -window: 10000000 -overlap: 20000 +## Variant comparison -samples: - - sample1 - - sample2 +HiFi_samples: + sample1: hifi_1.fq.gz + sample2: hifi_2.fq.gz -## molecular QTL mapping -#variants: -# PanGenie: '/cluster/work/pausch/alex/eQTL_GWAS/pangenie_8_wave/samples.all.pangenie_genotyping_DV.vcf.gz' -# SR: '/cluster/work/pausch/alex/eQTL_GWAS/variants/DV-SR/cohort.autosomes.WGS.imputed.vcf.gz' +## Association testing covariates: eQTL: @@ -63,5 +52,5 @@ mol_QTLs: permutations: 2500 -window: 1000000 #1 Mb +window: 1000000 #1 Mb cis window chunks: 40 diff --git a/README.md b/README.md index 0b49244..4862f26 100644 --- a/README.md +++ b/README.md @@ -13,15 +13,23 @@ There are broadly three phases - Variant analysis 
(statistics, linkage disequilibrium, SV overlap, etc.) - Association mapping of e/sQTL + +An example of the input needed is given in the `config/example.yaml`, broadly requiring + - haplotype-resolved assemblies for pangenome panel creation + - small variants to supplement pangenome panel + - any HiFi samples to test SV completeness + - gene expression/splicing files and covariates for molecular QTL mapping + Running with ``` -snakemake --configfile config/config.yaml +snakemake --configfile config/example.yaml ``` -Will produce execute the following DAG +Will execute the following DAG ![workflow](https://github.com/AnimalGenomicsETH/pangenome_molQTL/assets/29678761/bb0c73ca-fc31-4319-95e2-485da93f655a) -which produces the major output files (e.g., accuracy comparison of PanGenie vs DeepVariant, SV overlap with Jasmine, conditional QTL analysis with QTLtools, etc.), which can then be independently analysed further. +producing the major output files (e.g., accuracy comparison of PanGenie vs DeepVariant, SV overlap with Jasmine, conditional QTL analysis with QTLtools, etc.), which can then be independently analysed further. +Many of these steps are computationally intensive, especially with many samples to genotype, and so effectively require some form of HPC. 
### Citation diff --git a/config/cattle.yaml b/config/cattle.yaml deleted file mode 100644 index 899f4a5..0000000 --- a/config/cattle.yaml +++ /dev/null @@ -1,29 +0,0 @@ -chromosomes: 'all' -L: 50 -runs: - #static: '' - #0-5: 'random.sample(idx[:5],k=5)' #all shasta - #5-10: 'random.sample(idx[5:],k=5)' #all hifiasm - #10-20: 'random.sample(idx,k=len(idx))' #randomly shuffled - #20-40: '[i+random.randint(0,1)*5 for i in range(0,5,1)]' #randomly pick hifi/shasta per pair - #300-310: 'random.sample([0,3,4,5,8,9],k=6)' - - 5-10: 'random.sample(idx,k=len(idx))' #random shuffle - -assemblies: - ARS: '/cluster/work/pausch/alex/REF_DATA/ARS-UCD1.2_Btau5.0.1Y.fa' - B_hifiasm: 'assemblies/BSW.hifiasm.fasta' - O_hifiasm: 'assemblies/obv.hifiasm.fasta' - H_hifiasm: 'assemblies/ob1.hifiasm.fasta' - Os1_hifiasm: 'assemblies/ob1.hifiasm.fasta' - Os2_hifiasm: 'assemblies/ob1.hifiasm.fasta' - Od1_hifiasm: 'assemblies/ob1.hifiasm.fasta' - Od2_hifiasm: 'assemblies/ob1.hifiasm.fasta' - B31_hifiasm: 'assemblies/ob1.hifiasm.fasta' - B32_hifiasm: 'assemblies/ob1.hifiasm.fasta' - B41_hifiasm: 'assemblies/ob1.hifiasm.fasta' - B42_hifiasm: 'assemblies/ob1.hifiasm.fasta' - H_clr: 'assemblies/Highland.fa' - A_clr: 'assemblies/Angus.fa' - P_hifiasm: 'assemblies/pied.hifiasm.fasta' - S_ont: 'assemblies/pied.hifiasm.fasta' #manual diff --git a/config/example.yaml b/config/example.yaml new file mode 100644 index 0000000..0a5e1fb --- /dev/null +++ b/config/example.yaml @@ -0,0 +1,58 @@ +## Pangenome panel +reference: '/path/to/reference.fasta' + +assemblies: + asm1: + - 'asm1.hap1.fa' + - 'asm1.hap2.fa' + asm2: + - 'asm2.hap1.fa' + - 'asm2.hap2.fa' + asm3: + - 'asm3.hap1.fa' + - 'asm3.hap2.fa' + asm4: + - 'asm4.hap1.fa' + - 'asm4.hap2.fa' +trios: + asm1: + - asm2 + - asm3 + +scripts: '/path/to/PanGenie/scripts' +outdir: 'pangenie_panel/' +frac_missing: .15 + +## Pangenome genotyping +small_variants: DV.vcf.gz + +fastq: '' + +samples: + - sample1 + - sample2 + +## Variant comparison + 
+HiFi_samples: + sample1: hifi_1.fq.gz + sample2: hifi_2.fq.gz + + +## molecular QTL mapping + +covariates: + eQTL: + Testis: 'eQTL.covar' + sQTL: + Testis: 'sQTL.covar' +mol_QTLs: + eQTL: + Testis: 'eQTL.TPM' + sQTL: + Testis: 'sQTL.clusters' + + +permutations: 2500 +window: 1000000 #1 Mb cis window +chunks: 40 diff --git a/config/pangenie_compare.yaml b/config/pangenie_compare.yaml deleted file mode 100644 index 65bf05e..0000000 --- a/config/pangenie_compare.yaml +++ /dev/null @@ -1,21 +0,0 @@ -samples: - - BSWCHEF120023224572 - - BSWCHEF120069106030 - - BSWCHEF120080761164 - - BSWCHEF120104805454 - - BSWCHEF120114550405 - - BSWCHEF120118254972 - - BSWCHEF120121047356 - - BSWCHEF120124686545 - - BSWCHEM120031901830 - - BSWCHEM120032196501 - - BSWCHEM120057745616 - - BSWCHEM120066947247 - - RM1179 - - RM1896 - - RM1897 - - RM1900 - - RM1902 - - RM2009 - - RM947 - - RM951 diff --git a/config/pangenie_vcf.yaml b/config/pangenie_vcf.yaml deleted file mode 100644 index 0733700..0000000 --- a/config/pangenie_vcf.yaml +++ /dev/null @@ -1,10 +0,0 @@ -reference: '/cluster/work/pausch/alex/REF_DATA/ARS-UCD1.2_Btau5.0.1Y.fa' -assemblies: - OxO: '' - OSire: '' - ODam: '' -trios: - OxO: 'OSire ODam' -scripts: '/cluster/work/pausch/alex/software/vcf-merging/pangenome-graph-from-assemblies/scripts' -outdir: '' -