Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3660: Script to detect and update variants in variant warehouse that need normalisation #180

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
16 changes: 16 additions & 0 deletions tasks/eva_3660/logback.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<appender name="console" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d %5p %40.40c:%4L - %m%n</pattern>
</encoder>
</appender>

<logger name="org.springframework" level="info"/>
<logger name="uk.ac.ebi.eva" level="info"/>
<logger name="eva3660" level="info"/>

<root level="error">
<appender-ref ref="console"/>
</root>
</configuration>
116 changes: 116 additions & 0 deletions tasks/eva_3660/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>uk.ac.ebi.eva</groupId>
<artifactId>eva_3660</artifactId>
<version>0.1-SNAPSHOT</version>
<packaging>jar</packaging>

<dependencies>
<dependency>
<groupId>uk.ac.ebi.eva</groupId>
<artifactId>eva-pipeline</artifactId>
<version>2.9.8-SNAPSHOT</version>
<exclusions>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>uk.ac.ebi.eva</groupId>
<artifactId>eva-accession-core</artifactId>
<version>0.6.45-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.groovy</groupId>
<artifactId>groovy-all</artifactId>
<version>4.0.3</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>uk.ac.ebi.eva</groupId>
<artifactId>eva-common-groovyutils</artifactId>
<version>0.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.spockframework</groupId>
<artifactId>spock-core</artifactId>
<version>2.3-groovy-4.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hsqldb</groupId>
<artifactId>hsqldb</artifactId>
<version>2.3.4</version>
<scope>test</scope>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.codehaus.gmavenplus</groupId>
<artifactId>gmavenplus-plugin</artifactId>
<version>1.13.1</version>
<executions>
<execution>
<goals>
<goal>addSources</goal>
<goal>addTestSources</goal>
<goal>compile</goal>
<goal>compileTests</goal>
</goals>
</execution>
</executions>
<configuration>
<sources>
<source>
<directory>${project.basedir}/src</directory>
<includes>
<include>**/*.groovy</include>
</includes>
</source>
</sources>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>3.1.0</version>
</plugin>
</plugins>
</build>

<repositories>
<repository>
<id>cloudsmith</id>
<url>https://dl.cloudsmith.io/public/ebivariation/packages/maven/</url>
<releases>
<enabled>true</enabled>
<updatePolicy>always</updatePolicy>
</releases>
<snapshots>
<enabled>true</enabled>
<updatePolicy>always</updatePolicy>
</snapshots>
</repository>
</repositories>

<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<eva.mongo.host.test>localhost</eva.mongo.host.test>
</properties>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package eva3660

import uk.ac.ebi.eva.accession.core.contig.ContigMapping
import uk.ac.ebi.eva.accession.core.contig.ContigSynonyms

import java.nio.file.Path

import static org.springframework.util.StringUtils.hasText;

class ContigRenamingProcessor {

private ContigMapping contigMapping

ContigRenamingProcessor(Path pathToAssemblyReport) {
contigMapping = new ContigMapping("file:" + pathToAssemblyReport.toAbsolutePath().toString())
}

String getInsdcAccession(String contigName) {
ContigSynonyms contigSynonyms = contigMapping.getContigSynonyms(contigName)

StringBuilder message = new StringBuilder()
if (isGenbankReplacementPossible(contigName, contigSynonyms, message)) {
return contigSynonyms.getGenBank()
}
throw new IllegalArgumentException(message.toString())
}

/* More lenient version of this method in accessioning pipeline:
* https://github.com/EBIvariation/eva-accession/blob/master/eva-accession-core/src/main/java/uk/ac/ebi/eva/accession/core/contig/ContigMapping.java#L169
*/
boolean isGenbankReplacementPossible(String contig, ContigSynonyms contigSynonyms, StringBuilder reason) {
if (contigSynonyms == null) {
reason.append("Contig '" + contig + "' was not found in the assembly report!");
return false;
}

if (!hasText(contigSynonyms.getGenBank())) {
reason.append("No Genbank equivalent found for contig '" + contig + "' in the assembly report");
return false;
}
return true;
}

}
119 changes: 119 additions & 0 deletions tasks/eva_3660/src/main/groovy/eva3660/NormalisationProcessor.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package eva3660

import uk.ac.ebi.eva.accession.core.batch.io.FastaSequenceReader

import java.nio.file.Path

class NormalisationProcessor {

private FastaSequenceReader fastaReader

NormalisationProcessor(Path pathToFasta) {
fastaReader = new FastaSequenceReader(pathToFasta)
}

/**
* Normalisation with specified reference, alternate, secondary alternate alleles, and maf allele.
* Also truncate common leading context allele if present (i.e. allows empty alleles).
*
* @param contig Name of contig as found in FASTA
* @param valsforNorm Values required for normalisation (position & various alleles) Position of variant
* @return normalised values
*/
ValuesForNormalisation normaliseAndTruncate(String contig, ValuesForNormalisation valsForNorm) {
// mafAllele can be null
boolean mafIsNull = false
List<String> allelesToNorm = [valsForNorm.reference, valsForNorm.alternate, valsForNorm.mafAllele] + valsForNorm.secondaryAlternates
if (Objects.isNull(valsForNorm.mafAllele)) {
mafIsNull = true
allelesToNorm = [valsForNorm.reference, valsForNorm.alternate] + valsForNorm.secondaryAlternates
}
def (newStart, newEnd, newLength, newAlleles) = normalise(contig, valsForNorm.start, valsForNorm.end, valsForNorm.length, allelesToNorm)
// Check final base before initial base when truncating, to mirror eva-pipeline load:
// https://github.com/EBIvariation/eva-pipeline/blob/master/src/main/java/uk/ac/ebi/eva/pipeline/io/mappers/VariantVcfFactory.java#L185
if (allSameEnd(newAlleles)) {
(newEnd, newLength, newAlleles) = truncateRightmost(newEnd, newLength, newAlleles)
}
else if (allSameStart(newAlleles)) {
(newStart, newLength, newAlleles) = truncateLeftmost(newStart, newLength, newAlleles)
}
String newReference = newAlleles.pop()
String newAlternate = newAlleles.pop()
String newMafAllele = mafIsNull ? null : newAlleles.pop()
return new ValuesForNormalisation(newStart, newEnd, newLength, newReference, newAlternate, newMafAllele, newAlleles)
}

/**
* Normalise alleles to be parsimonious and left-aligned.
* See here: https://genome.sph.umich.edu/wiki/Variant_Normalization
*
* @param contig Name of contig as found in FASTA
* @param start Start coordinate of variant
* @param end End coordinate of variant
* @param length Length of variant
* @param alleles List of alleles
* @return normalised coordinates and list of normalised alleles (guaranteed to preserve input order)
*/
Tuple normalise(String contig, int start, int end, int length, List<String> alleles) {
// Allow for initially empty alleles
def (newStart, newLength, newAlleles) = addContextIfEmpty(contig, start, length, alleles)
def newEnd = end
// While all alleles end in same nucleotide
while (allSameEnd(newAlleles)) {
// Truncate rightmost nucleotide
(newEnd, newLength, newAlleles) = truncateRightmost(newEnd, newLength, newAlleles)
// If exists an empty allele, extend alleles 1 to the left
(newStart, newLength, newAlleles) = addContextIfEmpty(contig, newStart, newLength, newAlleles)
}
// While all alleles start with same nucleotide and have length 2 or more
while (allSameStart(newAlleles) && allLengthAtLeastTwo(newAlleles)) {
// Truncate leftmost nucleotide
(newStart, newLength, newAlleles) = truncateLeftmost(newStart, newLength, newAlleles)
}
return new Tuple(newStart, newEnd, newLength, newAlleles)
}

private Tuple addContextIfEmpty(String contig, int start, int length, List<String> alleles) {
def newStart = start
def newLength = length
def newAlleles = alleles
// If already at the edge of the contig, do nothing
if (start == 1) {
return new Tuple(newStart, newLength, newAlleles)
}
def existEmptyAlleles = alleles.stream().any{ it.size() < 1 }
if (existEmptyAlleles) {
// Extend alleles 1 to the left
newStart--
newLength++
def contextBase = fastaReader.getSequenceToUpperCase(contig, newStart, newStart)
newAlleles = newAlleles.stream().collect { "${contextBase}${it}" }
}
return new Tuple(newStart, newLength, newAlleles)
}

private boolean allSameEnd(List<String> alleles) {
return alleles.stream().collect{it[-1] }.toSet().size() == 1
}

private boolean allSameStart(List<String> alleles) {
return alleles.stream().collect{it[0] }.toSet().size() == 1
}

private boolean allLengthAtLeastTwo(List<String> alleles) {
return alleles.stream().allMatch{it.size() >= 2 }
}

private Tuple truncateRightmost(int end, int length, List<String> alleles) {
return new Tuple(--end, --length, alleles.stream().collect{ it.substring(0, it.size()-1) })
}

private Tuple truncateLeftmost(int start, int length, List<String> alleles) {
return new Tuple(++start, --length, alleles.stream().collect{ it.substring(1) })
}

void close() {
fastaReader.close()
}

}
Loading