diff --git a/cutseq/run.py b/cutseq/run.py
index 8c70d74..fcd546e 100644
--- a/cutseq/run.py
+++ b/cutseq/run.py
@@ -126,11 +126,12 @@ def reverse_complement(b):
def remove_fq_suffix(f):
- suffixes_base = ["_R1_001", "_R2_001", "_R1", "_R2", ""]
suffixes = [
- y + "." + x for x in ["fastq.gz", "fq.gz", "fastq", "fq"] for y in suffixes_base
+ f"{base}.{ext}"
+ for ext in ["fastq.gz", "fq.gz", "fastq", "fq"]
+ for base in ["_R1_001", "_R2_001", "_R1", "_R2", ""]
]
- print(suffixes)
+
for suffix in suffixes:
if f.endswith(suffix):
return f.removesuffix(suffix)
diff --git a/docs/_config.yml b/docs/_config.yml
index 2d95162..3c8786f 100644
--- a/docs/_config.yml
+++ b/docs/_config.yml
@@ -15,3 +15,8 @@ callouts:
plugins:
- jekyll-spaceship
+ - jekyll-optional-front-matter
+
+nav:
+ - Home: index.md
+ - Adapter Schemes: adapters.md
diff --git a/docs/adapters.md b/docs/adapters.md
new file mode 100644
index 0000000..5f73096
--- /dev/null
+++ b/docs/adapters.md
@@ -0,0 +1,259 @@
+# Adapter Schemes
+
+CutSeq supports various built-in adapter schemes for different NGS library preparation methods. Each scheme follows the same general pattern and is built from the components listed below.
+
+## Components
+
+- **p5/p7**: Illumina sequencing adapters (shown in light green)
+- **inline5/3**: Fixed DNA barcode sequences enclosed in parentheses, e.g. (ATCACG) (shown in yellow)
+- **umi5/3**: Random UMI sequences marked as N (shown in orange)
+- **mask5/3**: Sequences to be masked marked as X (shown in gray)
+- **strand**: Direction indicator (>, <, or -) (shown with red arrow)
+
+## Built-in Schemes
+
+### DSLIGATION (dsDNA Ligation)
+
+
+ AGTTCTACAGTCCGACGATCT
+ >
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Basic dsDNA ligation with A-tailing
+- Forward orientation
+- No UMIs or special trimming needed
+
+### SMALLRNA (Small RNA Libraries)
+
+
+ CACGACGCTCTTCCGATCT
+ >
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Used for small RNA sequencing
+- Double ligation method
+- Forward orientation
+- Optional 2nt trimming on both ends for quality
+
+### INLINE (Custom Barcoded Libraries)
+
+
+ AGTTCTACAGTCCGACGATC
+ NNNNN
+ >
+ NNNNN
+ (ATCACG)
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Used for libraries with inline barcodes
+- Dual UMI design (5nt each)
+- Forward orientation
+- Contains fixed barcode sequence
+
+### TAKARAV2 (SMARTer® Stranded Protocol V2)
+
+
+ ACACGACGCTCTTCCGATCT
+ XXX
+ <
+ XXX
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Earlier version of TAKARA stranded protocol
+- Includes masking for template switching artifacts
+- Reverse orientation to RNA
+- No UMI sequences
+
+### STRANDED (Generic Stranded RNA-seq)
+
+
+ ACACGACGCTCTTCCGATCT
+ X
+ <
+ XXX
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Basic stranded RNA-seq protocol
+- Minimal masking for ligation artifacts
+- Reverse orientation
+- No UMI sequences
+
+### TAKARAV3 (SMARTer® Stranded Total RNA-Seq Kit v3)
+
+
+ ACACGACGCTCTTCCGATCT
+ XXX
+ <
+ XXXXXX
+ NNNNNNNN
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Used for stranded RNA-seq
+- Contains 8nt UMI
+- Reverse orientation to RNA
+- Includes masking for template switching artifacts
+
+### ECLIP6 (eCLIP Protocol)
+
+
+ ACACGACGCTCTTCCGATCT
+ XX
+ <
+ X
+ NNNNNN
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Used for eCLIP and similar protocols
+- Contains 6nt UMI
+- Reverse orientation
+- Short masking regions
+
+### ECLIP10 (Extended eCLIP Protocol)
+
+
+ ACACGACGCTCTTCCGATCT
+ XX
+ <
+ X
+ NNNNNNNNNN
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Extended version of eCLIP protocol
+- Contains 10nt UMI for higher complexity
+- Reverse orientation
+- Short masking regions
+
+### SACSEQV3 (SAC-seq Protocol V3)
+
+
+ AGTTCTACAGTCCGACGATCT
+ NNNNNNNN
+ X
+ >
+ XX
+ NNNNNNNN
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Dual UMI design (8nt each)
+- Forward orientation
+- Balanced masking on both sides
+- Used for high-complexity libraries
+
+### XGENRNA (xGen RNA Library Prep)
+
+
+ ACACGACGCTCTTCCGATCT
+ XXXXXX
+ <
+ XXXXXXXXXXXXXXX
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Handles polyC/G artifacts from random RT priming
+- Extended masking for adaptase tail (up to 15bp)
+- Reverse orientation
+- Uses random polyC tail as pseudo-UMI
+
+### XGENMETHY (xGen Methyl-Seq)
+
+
+ ACACGACGCTCTTCCGATCT
+ XX
+ >
+ XXXXXXXXXX
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Designed for methylation sequencing
+- Trims 10 bases from read ends
+- Forward orientation
+- Includes random primer artifact removal
+
+### XGENSNMC (snmC-seq Protocol)
+
+
+ ACACGACGCTCTTCCGATCT
+ XXXXXX
+ >
+ XXXXXXXXXXXXXXX
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Specialized for single-nucleus methylome sequencing
+- Extended 15-base trimming
+- Forward orientation
+- Heavy masking for protocol artifacts
+
+### PBAT (Post-Bisulfite Adapter Tagging)
+
+
+ ACACGACGCTCTTCCGATCT
+ XXXXXX
+ <
+ XXXXXX
+ AGATCGGAAGAGCACACGTC
+
+
+
+- Used for post-bisulfite DNA sequencing
+- Random primer-based adapter addition
+- Reverse orientation
+- Symmetric masking for random tails
+
+### NEXTERA (ATAC-seq)
+
+
+ AGATGTGTATAAGAGACAG
+ >
+ CTGTCTCTTATACACATCT
+
+
+
+- Used for ATAC-seq libraries
+- Simple design without UMIs or barcodes
+- Forward orientation
+- Standard Nextera adapters
+
+### ILLUMINARNA (Illumina Stranded RNA-Seq)
+
+
+ AGATGTGTATAAGAGACAG
+ <
+ CTGTCTCTTATACACATCT
+
+
+
+- Standard Illumina stranded RNA-seq protocol
+- Reverse orientation
+- Simple design without UMIs or masking
+- Direct adapter ligation method
+
+
\ No newline at end of file