diff --git a/cutseq/run.py b/cutseq/run.py index 8c70d74..fcd546e 100644 --- a/cutseq/run.py +++ b/cutseq/run.py @@ -126,11 +126,12 @@ def reverse_complement(b): def remove_fq_suffix(f): - suffixes_base = ["_R1_001", "_R2_001", "_R1", "_R2", ""] suffixes = [ - y + "." + x for x in ["fastq.gz", "fq.gz", "fastq", "fq"] for y in suffixes_base + f"{base}.{ext}" + for ext in ["fastq.gz", "fq.gz", "fastq", "fq"] + for base in ["_R1_001", "_R2_001", "_R1", "_R2", ""] ] - print(suffixes) + for suffix in suffixes: if f.endswith(suffix): return f.removesuffix(suffix) diff --git a/docs/_config.yml b/docs/_config.yml index 2d95162..3c8786f 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -15,3 +15,8 @@ callouts: plugins: - jekyll-spaceship + - jekyll-optional-front-matter + +nav: + - Home: index.md + - Adapter Schemes: adapters.md diff --git a/docs/adapters.md b/docs/adapters.md new file mode 100644 index 0000000..5f73096 --- /dev/null +++ b/docs/adapters.md @@ -0,0 +1,259 @@ +# Adapter Schemes + +CutSeq supports various built-in adapter schemes for different NGS library preparation methods. Each scheme follows a general pattern: + +## Components + +- **p5/p7**: Illumina sequencing adapters (shown in light green) +- **inline5/3**: Fixed DNA barcode sequences in brackets () (shown in yellow) +- **umi5/3**: Random UMI sequences marked as N (shown in orange) +- **mask5/3**: Sequences to be masked marked as X (shown in gray) +- **strand**: Direction indicator (>, <, or -) (shown with red arrow) + +## Built-in Schemes + +### DSLIGATION (dsDNA Ligation) +
+
+ AGTTCTACAGTCCGACGATCT + > + AGATCGGAAGAGCACACGTC +
+
+ +- Basic dsDNA ligation with A-tailing +- Forward orientation +- No UMIs or special trimming needed + +### SMALLRNA (Small RNA Libraries) +
+
+ CACGACGCTCTTCCGATCT + > + AGATCGGAAGAGCACACGTC +
+
+ +- Used for small RNA sequencing +- Double ligation method +- Forward orientation +- Optional 2nt trimming on both ends for quality + +### INLINE (Custom Barcoded Libraries) +
+
+ AGTTCTACAGTCCGACGATC + NNNNN + > + NNNNN + (ATCACG) + AGATCGGAAGAGCACACGTC +
+
+ +- Used for libraries with inline barcodes +- Dual UMI design (5nt each) +- Forward orientation +- Contains fixed barcode sequence + +### TAKARAV2 (SMARTer® Stranded Protocol V2) +
+
+ ACACGACGCTCTTCCGATCT + XXX + < + XXX + AGATCGGAAGAGCACACGTC +
+
+ +- Earlier version of TAKARA stranded protocol +- Includes masking for template switching artifacts +- Reverse orientation to RNA +- No UMI sequences + +### STRANDED (Generic Stranded RNA-seq) +
+
+ ACACGACGCTCTTCCGATCT + X + < + XXX + AGATCGGAAGAGCACACGTC +
+
+ +- Basic stranded RNA-seq protocol +- Minimal masking for ligation artifacts +- Reverse orientation +- No UMI sequences + +### TAKARAV3 (SMARTer® Stranded Total RNA-Seq Kit v3) +
+
+ ACACGACGCTCTTCCGATCT + XXX + < + XXXXXX + NNNNNNNN + AGATCGGAAGAGCACACGTC +
+
+ +- Used for stranded RNA-seq +- Contains 8nt UMI +- Reverse orientation to RNA +- Includes masking for template switching artifacts + +### ECLIP6 (eCLIP Protocol) +
+
+ ACACGACGCTCTTCCGATCT + XX + < + X + NNNNNN + AGATCGGAAGAGCACACGTC +
+
+ +- Used for eCLIP and similar protocols +- Contains 6nt UMI +- Reverse orientation +- Short masking regions + +### ECLIP10 (Extended eCLIP Protocol) +
+
+ ACACGACGCTCTTCCGATCT + XX + < + X + NNNNNNNNNN + AGATCGGAAGAGCACACGTC +
+
+ +- Extended version of eCLIP protocol +- Contains 10nt UMI for higher complexity +- Reverse orientation +- Short masking regions + +### SACSEQV3 (SAC-seq Protocol V3) +
+
+ AGTTCTACAGTCCGACGATCT + NNNNNNNN + X + > + XX + NNNNNNNN + AGATCGGAAGAGCACACGTC +
+
+ +- Dual UMI design (8nt each) +- Forward orientation +- Balanced masking on both sides +- Used for high-complexity libraries + +### XGENRNA (xGen RNA Library Prep) +
+
+ ACACGACGCTCTTCCGATCT + XXXXXX + < + XXXXXXXXXXXXXXX + AGATCGGAAGAGCACACGTC +
+
+ +- Handles polyC/G artifacts from random RT priming +- Extended masking for adaptase tail (up to 15bp) +- Reverse orientation +- Uses random polyC tail as pseudo-UMI + +### XGENMETHY (xGen Methyl-Seq) +
+
+ ACACGACGCTCTTCCGATCT + XX + > + XXXXXXXXXX + AGATCGGAAGAGCACACGTC +
+
+ +- Designed for methylation sequencing +- Trims 10 bases from read ends +- Forward orientation +- Includes random primer artifact removal + +### XGENSNMC (snmC-seq Protocol) +
+
+ ACACGACGCTCTTCCGATCT + XXXXXX + > + XXXXXXXXXXXXXXX + AGATCGGAAGAGCACACGTC +
+
+ +- Specialized for single-nucleus methylome sequencing +- Extended 15-base trimming +- Forward orientation +- Heavy masking for protocol artifacts + +### PBAT (Post-Bisulfite Adapter Tagging) +
+
+ ACACGACGCTCTTCCGATCT + XXXXXX + < + XXXXXX + AGATCGGAAGAGCACACGTC +
+
+ +- Used for post-bisulfite DNA sequencing +- Random primer-based adapter addition +- Reverse orientation +- Symmetric masking for random tails + +### NEXTERA (ATAC-seq) +
+
+ AGATGTGTATAAGAGACAG + > + CTGTCTCTTATACACATCT +
+
+ +- Used for ATAC-seq libraries +- Simple design without UMIs or barcodes +- Forward orientation +- Standard Nextera adapters + +### ILLUMINARNA (Illumina Stranded RNA-Seq) +
+
+ AGATGTGTATAAGAGACAG + < + CTGTCTCTTATACACATCT +
+
+ +- Standard Illumina stranded RNA-seq protocol +- Reverse orientation +- Simple design without UMIs or masking +- Direct adapter ligation method + + \ No newline at end of file