Skip to content

Commit

Permalink
Motif analysis updates
Browse files Browse the repository at this point in the history
This update includes components enabling custom motif analysis that can
be triggered at specific loci in the catalog. Initially it contains
custom motif analysis for RFC1 to differentiate pathogenic from benign
motif patterns. This is still a work in progress subject to further
refinement and interface changes.

Additional changes include
- URL input support
- New min locus coverage parameter
- Various build and docker build improvements
  • Loading branch information
ctsa committed Feb 2, 2022
1 parent 5501b27 commit 0fbcbf8
Show file tree
Hide file tree
Showing 169 changed files with 376 additions and 54,058 deletions.
60 changes: 28 additions & 32 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,72 +9,69 @@ find_package(ZLIB REQUIRED)
find_package(BZip2 REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(CURL REQUIRED)
find_package(Git REQUIRED)

if(NOT APPLE)
find_package(OpenSSL REQUIRED)
endif()

include(ExternalProject)
set(installDir ${CMAKE_CURRENT_BINARY_DIR}/install)


ExternalProject_Add(htslib
BUILD_IN_SOURCE YES
GIT_REPOSITORY "https://github.com/samtools/htslib.git"
GIT_TAG "1.10.2"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND $(MAKE)
INSTALL_COMMAND $(MAKE) install prefix=${installDir}
LOG_DOWNLOAD YES
URL https://github.com/samtools/htslib/releases/download/1.10.2/htslib-1.10.2.tar.bz2
CONFIGURE_COMMAND ./configure --prefix=${installDir} --enable-libcurl --enable-s3
BUILD_COMMAND $(MAKE) lib-static
)


# Setup user config to force same c++ compiler in boost
if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))
set(BOOST_UCONFIG "${CMAKE_BINARY_DIR}/user-config.jam")
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
file(WRITE "${BOOST_UCONFIG}" "using gcc : : \"${CMAKE_CXX_COMPILER}\" ;\n")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
file(WRITE "${BOOST_UCONFIG}" "using clang : : \"${CMAKE_CXX_COMPILER}\" ;\n")
set(B2_OPTIONS ${B2_OPTIONS} "toolset=clang")
endif()
set(BOOST_PATCH_COMMAND ${CMAKE_COMMAND} -E copy ${BOOST_UCONFIG} tools/build/src/user-config.jam)
# Setup boost user config so that it uses the same c++ compiler as other components:
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set (BOOST_COMPILER_TAG "gcc")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set (BOOST_COMPILER_TAG "clang")
set(B2_OPTIONS ${B2_OPTIONS} "toolset=clang")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
set (BOOST_COMPILER_TAG "darwin")
set(B2_OPTIONS ${B2_OPTIONS} "toolset=clang")
endif()

if (BOOST_COMPILER_TAG)
set(BOOST_UCONFIG "${CMAKE_BINARY_DIR}/user-config.jam")
file(WRITE "${BOOST_UCONFIG}" "using ${BOOST_COMPILER_TAG} : : \"${CMAKE_CXX_COMPILER}\" ;\n")
set(BOOST_PATCH_COMMAND ${CMAKE_COMMAND} -E copy ${BOOST_UCONFIG} tools/build/src/user-config.jam)
endif()

ExternalProject_Add(Boost
BUILD_IN_SOURCE YES
URL https://boostorg.jfrog.io/artifactory/main/release/1.73.0/source/boost_1_73_0.tar.bz2
UPDATE_COMMAND ""
PATCH_COMMAND ${BOOST_PATCH_COMMAND}
CONFIGURE_COMMAND ./bootstrap.sh --prefix=${installDir}/lib
BUILD_COMMAND ./b2 install -j8 --prefix=${installDir} --with-filesystem --with-system --with-program_options link=static ${B2_OPTIONS}
CONFIGURE_COMMAND ./bootstrap.sh --prefix=${installDir}
BUILD_COMMAND ./b2 install -j8 --with-filesystem --with-system --with-program_options link=static ${B2_OPTIONS}
INSTALL_COMMAND ""
)


ExternalProject_Add(spdlog
GIT_REPOSITORY "https://github.com/gabime/spdlog.git"
GIT_TAG "v1.6.1"
UPDATE_COMMAND ""
URL https://github.com/gabime/spdlog/archive/refs/tags/v1.6.1.tar.gz
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${installDir}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
)


ExternalProject_Add(googletest
GIT_REPOSITORY "https://github.com/google/googletest.git"
GIT_TAG "release-1.10.0"
UPDATE_COMMAND ""
URL https://github.com/google/googletest/archive/refs/tags/release-1.10.0.tar.gz
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${installDir}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
)


# CMAKE_CXX_STANDARD is required for OS X 10.15
# CMAKE_CXX_STANDARD is required for macOS 10.15
ExternalProject_Add(abseil
GIT_REPOSITORY "https://github.com/abseil/abseil-cpp"
GIT_TAG "20210324.2"
UPDATE_COMMAND ""
URL https://github.com/abseil/abseil-cpp/archive/refs/tags/20210324.2.tar.gz
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${installDir}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_CXX_STANDARD=11
Expand All @@ -88,7 +85,6 @@ ExternalProject_Add(ehunter
TEST_COMMAND "ctest"
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${installDir}
-DCMAKE_PREFIX_PATH:PATH=${installDir}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} ${CMAKE_SOURCE_DIR}/ehunter
)
Expand Down
13 changes: 7 additions & 6 deletions docs/02_Installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,17 @@ build the program from source follow the instructions below.

Prerequisites:

- A recent version of [GCC](https://gcc.gnu.org/) or
- A recent version of [gcc](https://gcc.gnu.org/) or
[clang](http://clang.llvm.org/) compiler supporting the C++11 standard
- The minimum gcc version is 5.1
- [CMake](https://cmake.org/) version 3.13.0 or above
- Additional development libraries, which depend on the operating system:
- Centos7
- `yum install bzip2-devel libcurl-devel libstdc++-static xz-devel zlib-devel`
- Additional development packages, which depend on the operating system:
- CentOS 8
- `bzip2-devel libcurl-devel libstdc++-static openssl-devel xz-devel zlib-devel`
- Ubuntu 20.04
- `apt install zlib1g libbz2-dev liblzma-dev libcurl4-openssl-dev`
- `libbz2-dev libcurl4-openssl-dev liblzma-dev libssl-dev zlib1g-dev`
- macOS 10.15
- `brew install xz`
- `xz` (from homebrew)

If the above prerequisites are satisfied, you are ready to
build the program. Note that during the build procedure, cmake will
Expand Down
35 changes: 25 additions & 10 deletions docs/03_Usage.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Usage

Expansion Hunter requires an indexed BAM or a CRAM file containing aligned reads
from a PCR-free WGS sample, a FASTA file with a reference genome assembly (which
must be the same as the one used to align the reads), and a [variant catalog
file](04_VariantCatalogFiles.md).
Expansion Hunter requires the following inputs:
1. A BAM or a CRAM file containing aligned reads from a PCR-free WGS sample.
    1. The BAM or CRAM file must be sorted and indexed if using the seeking [analysis mode](#analysis-modes).
    2. The BAM or CRAM file may be a local filesystem path or [URL](#url-support).
2. A FASTA file with a reference genome assembly (which must be the same as the one used to align the reads).
3. A [variant catalog file](04_VariantCatalogFiles.md).

Expansion Hunter outputs a VCF file and a JSON file with variant genotypes and
other useful information along with a BAMlet containing alignments of reads that
Expand All @@ -12,8 +14,8 @@ are largely equivalent, but the JSON file may be easier to parse
programmatically. Here is a template with the names of the required parameters.

```bash
ExpansionHunter --reads <BAM/CRAM file with aligned reads> \
--reference <FASTA file with reference genome> \
ExpansionHunter --reads <aligned reads BAM/CRAM file/URL> \
--reference <reference genome FASTA file> \
--variant-catalog <JSON file specifying variants to genotype> \
--output-prefix <Prefix for the output files>
```
Expand All @@ -26,21 +28,34 @@ optional arguments.
* `--sex <arg>` Specifies sex of the sample; can be either `male` or `female`
(default). This parameter only affects repeats on sex chromosomes.

* `--threads <int>` Specifies how many threads can be used to accelerate analysis
of large variant catalogs. Set to 1 by default. Typically seeking mode can
benefit from relatively high thread counts, while for streaming mode
there is limited benefit beyond about 16 threads.

* `--min-locus-coverage <int>` Specifies minimum read coverage depth at loci
on diploid chromosomes required to attempt genotyping. Automatically reduced
to half for loci on haploid chromosomes. The locus will be skipped if the
coverage falls below this value. Set to 10 by default.

* `--region-extension-length <int>` Specifies how far from on/off-target regions
to search for informative reads. Set to 1000 by default.

* `--analysis-mode <mode>` Specify analysis mode, which can be either `seeking` or
`streaming`. The default mode is `seeking`. See further description of analysis
modes below.

* `--threads <int>` Specifies how many threads can be used to accelerate analysis
of large variant catalogs. Set to 1 by default. Typically seeking mode can
benefit from relatively high thread counts, while for streaming mode
there is limited benefit beyond about 16 threads.

Note that the full list of program options with brief explanations can be
obtained by running `ExpansionHunter --help`.

### URL support

The aligned reads input BAM or CRAM file may be a local filesystem path or URL.
Supported protocols for URL input include ftp, https and s3. S3 bucket access
can be configured using the URL syntax and environment variables supported by
samtools/htslib.

### Analysis modes

#### Seeking mode
Expand Down
1 change: 1 addition & 0 deletions docs/05_OutputJsonFiles.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ parameters (`SampleParameters` field) and analysis results summarized by
locus (`LocusResults` field). The locus results contain these fields
* `AlleleCount` The expected number of alleles at the locus
* `Coverage` Estimated read coverage at the locus
* `FragmentLength` The fragment size estimated from read pairs fully contained in either the left or right flank of the repeat region
* `LocusId` Locus identifier
* `ReadLength` Mean read length at the locus
* `Variants` Genotypes and other information describing each variant
Expand Down
23 changes: 16 additions & 7 deletions ehunter/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.13)
project(ExpansionHunter LANGUAGES C CXX)
project(ExpansionHunter LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
Expand Down Expand Up @@ -29,10 +29,14 @@ find_package(Threads REQUIRED)
find_package(spdlog REQUIRED)
find_package(absl REQUIRED)

if (NOT APPLE)
find_package(OpenSSL REQUIRED)
endif ()

find_library(htslib libhts.a)
find_library(htslib hts)

add_subdirectory(thirdparty/graph-tools-master-f421f4c)
add_subdirectory(thirdparty/graph-tools-master-0cd9399)

add_library(ExpansionHunterLib
alignment/AlignmentClassifier.hh alignment/AlignmentClassifier.cpp
Expand Down Expand Up @@ -95,19 +99,18 @@ add_library(ExpansionHunterLib


set(CTPL_INCLUDE_DIRS thirdparty/ctpl/ctpl-0.0.2)
target_include_directories(ExpansionHunterLib PUBLIC
${CMAKE_SOURCE_DIR}
target_include_directories(ExpansionHunterLib PUBLIC ${CMAKE_SOURCE_DIR})
target_include_directories(ExpansionHunterLib SYSTEM PUBLIC
${Boost_INCLUDE_DIRS}
${LIBLZMA_INCLUDE_DIRS}
${CURL_INCLUDE_DIRS}
${CTPL_INCLUDE_DIRS}
)

# Set static linking of gcc standard libraries to simplify binary distribution
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(STATIC_FLAGS -static-libgcc -static-libstdc++)
endif()

endif ()

target_link_libraries(ExpansionHunterLib PUBLIC
${STATIC_FLAGS}
Expand All @@ -122,6 +125,11 @@ target_link_libraries(ExpansionHunterLib PUBLIC
spdlog::spdlog
absl::flat_hash_set)

if (NOT APPLE)
target_link_libraries(ExpansionHunterLib PUBLIC
OpenSSL::Crypto)
endif ()

add_executable(ExpansionHunter
app/ExpansionHunter.cpp
)
Expand All @@ -148,6 +156,7 @@ add_executable(UnitTests
tests/RegionGraphTest.cpp
tests/RepeatAnalyzerTest.cpp
tests/RepeatGenotypeTest.cpp
tests/RFC1MotifAnalysisUtilTest.cpp
tests/SmallVariantGenotyperTest.cpp
tests/SoftclippingAlignerTest.cpp
tests/StrAlignTest.cpp
Expand Down
8 changes: 8 additions & 0 deletions ehunter/core/Common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@

#include "core/Common.hh"

#include <regex>

using std::string;

namespace ehunter
Expand Down Expand Up @@ -97,4 +99,10 @@ std::ostream& operator<<(std::ostream& out, NumericInterval numericInterval)
return out;
}

/// \brief Returns true if \p path looks like a URL rather than a local file path
///
/// A path is classified as a URL when it contains the "://" scheme separator
/// anywhere in the string. Neither the scheme nor the remainder is validated.
///
/// \param path Input path or URL as supplied by the user
/// \return true if "://" occurs in \p path, false otherwise (including for an empty string)
///
bool isURL(const std::string& path)
{
    // A plain substring search replaces the former `.*?://.*` regex: it gives the same answer
    // for every input without line terminators, without constructing or running a std::regex.
    return path.find("://") != std::string::npos;
}

}
6 changes: 6 additions & 0 deletions ehunter/core/Common.hh
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,10 @@ std::ostream& operator<<(std::ostream& out, ReadType readType);
std::ostream& operator<<(std::ostream& out, AlleleCount alleleCount);
std::ostream& operator<<(std::ostream& out, NumericInterval numericInterval);

/// \brief Returns true if the path refers to a URL instead of a local file
///
/// This does not test if the URL is well formed
///
bool isURL(const std::string& path);

}
5 changes: 1 addition & 4 deletions ehunter/core/LocusStats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,7 @@ void LocusStatsCalculator::inspect(const GraphAlignment& readAlign, const GraphA
recordFragLen(readAlign, mateAlign);
}

void LocusStatsCalculator::inspectRead(const GraphAlignment& readAlign)
{
recordReadLen(readAlign);
}
void LocusStatsCalculator::inspectRead(const GraphAlignment& readAlign) { recordReadLen(readAlign); }

static AlleleCount determineExpectedAlleleCount(ChromType chromType, Sex sex)
{
Expand Down
11 changes: 9 additions & 2 deletions ehunter/core/Parameters.hh
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,11 @@ class HeuristicParameters
{
public:
HeuristicParameters(
int regionExtensionLength, int qualityCutoffForGoodBaseCall, bool skipUnaligned,
int regionExtensionLength, int minLocusCoverage, int qualityCutoffForGoodBaseCall, bool skipUnaligned,
const graphtools::AlignerType alignerType, int kmerLenForAlignment = 14, int paddingLength = 10,
int seedAffixTrimLength = 14, int orientationPredictorKmerLen = 10, int orientationPredictorMinKmerCount = 3)
: regionExtensionLength_(regionExtensionLength)
, minLocusCoverage_(minLocusCoverage)
, qualityCutoffForGoodBaseCall_(qualityCutoffForGoodBaseCall)
, skipUnaligned_(skipUnaligned)
, alignerType_(std::move(alignerType))
Expand All @@ -129,6 +130,7 @@ public:
}

int regionExtensionLength() const { return regionExtensionLength_; }
int minLocusCoverage() const { return minLocusCoverage_; }
int qualityCutoffForGoodBaseCall() const { return qualityCutoffForGoodBaseCall_; }
bool skipUnaligned() const { return skipUnaligned_; }
graphtools::AlignerType alignerType() const { return alignerType_; }
Expand All @@ -140,6 +142,7 @@ public:

private:
int regionExtensionLength_;
int minLocusCoverage_;
int qualityCutoffForGoodBaseCall_;
bool skipUnaligned_;
graphtools::AlignerType alignerType_;
Expand All @@ -153,12 +156,16 @@ private:
// Per-locus parameters (settable from variant catalog) controlling genotyping
struct GenotyperParameters
{
GenotyperParameters(int minLocusCoverage)
: minLocusCoverage(minLocusCoverage)
{
}
// Base error rate assumed in SNV key-allele genotyping model
double errorRate = 0.02;
// Threshold to call SNV key-allele confidently present / absent
double likelihoodRatioThreshold = 10000;
// Minimal estimated locus coverage (depth) to attempt genotyping
double minLocusCoverage = 10;
double minLocusCoverage;
// Minimal number of reads spanning a variant breakpoint
int minBreakpointSpanningReads = 5;
};
Expand Down
Loading

0 comments on commit 0fbcbf8

Please sign in to comment.