EBIvariation · apriltuesday · Oct 29, 2024 · Oct 30, 2024 · Oct 31, 2024 · Oct 31, 2024
diff --git a/tasks/eva_3660/logback.xml b/tasks/eva_3660/logback.xml
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <appender name="console" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d %5p %40.40c:%4L - %m%n</pattern>
+        </encoder>
+    </appender>
+
+    <logger name="org.springframework" level="info"/>
+    <logger name="uk.ac.ebi.eva" level="info"/>
+    <logger name="eva3660" level="info"/>
+
+    <root level="error">
+        <appender-ref ref="console"/>
+    </root>
+</configuration>
diff --git a/tasks/eva_3660/pom.xml b/tasks/eva_3660/pom.xml
@@ -0,0 +1,116 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>uk.ac.ebi.eva</groupId>
+    <artifactId>eva_3660</artifactId>
+    <version>0.1-SNAPSHOT</version>
+    <packaging>jar</packaging>
+
+    <dependencies>
+        <dependency>
+            <groupId>uk.ac.ebi.eva</groupId>
+            <artifactId>eva-pipeline</artifactId>
+            <version>2.9.8-SNAPSHOT</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>com.fasterxml.jackson.core</groupId>
+                    <artifactId>jackson-databind</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>uk.ac.ebi.eva</groupId>
+            <artifactId>eva-accession-core</artifactId>
+            <version>0.6.45-SNAPSHOT</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.groovy</groupId>
+            <artifactId>groovy-all</artifactId>
+            <version>4.0.3</version>
+            <type>pom</type>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.9</version>
+        </dependency>
+        <dependency>
+            <groupId>uk.ac.ebi.eva</groupId>
+            <artifactId>eva-common-groovyutils</artifactId>
+            <version>0.2-SNAPSHOT</version>
+        </dependency>
+        <dependency>
+            <groupId>org.spockframework</groupId>
+            <artifactId>spock-core</artifactId>
+            <version>2.3-groovy-4.0</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.hsqldb</groupId>
+            <artifactId>hsqldb</artifactId>
+            <version>2.3.4</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.codehaus.gmavenplus</groupId>
+                <artifactId>gmavenplus-plugin</artifactId>
+                <version>1.13.1</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>addSources</goal>
+                            <goal>addTestSources</goal>
+                            <goal>compile</goal>
+                            <goal>compileTests</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <sources>
+                        <source>
+                            <directory>${project.basedir}/src</directory>
+                            <includes>
+                                <include>**/*.groovy</include>
+                            </includes>
+                        </source>
+                    </sources>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.codehaus.mojo</groupId>
+                <artifactId>exec-maven-plugin</artifactId>
+                <version>3.1.0</version>
+            </plugin>
+        </plugins>
+    </build>
+
+    <repositories>
+        <repository>
+            <id>cloudsmith</id>
+            <url>https://dl.cloudsmith.io/public/ebivariation/packages/maven/</url>
+            <releases>
+                <enabled>true</enabled>
+                <updatePolicy>always</updatePolicy>
+            </releases>
+            <snapshots>
+                <enabled>true</enabled>
+                <updatePolicy>always</updatePolicy>
+            </snapshots>
+        </repository>
+    </repositories>
+
+    <properties>
+        <maven.compiler.source>1.8</maven.compiler.source>
+        <maven.compiler.target>1.8</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <eva.mongo.host.test>localhost</eva.mongo.host.test>
+    </properties>
+
+</project>
diff --git a/tasks/eva_3660/src/main/groovy/eva3660/ContigRenamingProcessor.groovy b/tasks/eva_3660/src/main/groovy/eva3660/ContigRenamingProcessor.groovy
@@ -0,0 +1,44 @@
+package eva3660
+
+import uk.ac.ebi.eva.accession.core.contig.ContigMapping
+import uk.ac.ebi.eva.accession.core.contig.ContigSynonyms
+
+import java.nio.file.Path
+
+import static org.springframework.util.StringUtils.hasText;
+
+class ContigRenamingProcessor {
+
+    private ContigMapping contigMapping
+
+    ContigRenamingProcessor(Path pathToAssemblyReport) {
+        contigMapping = new ContigMapping("file:" + pathToAssemblyReport.toAbsolutePath().toString())
+    }
+
+    String getInsdcAccession(String contigName) {
+        ContigSynonyms contigSynonyms = contigMapping.getContigSynonyms(contigName)
+
+        StringBuilder message = new StringBuilder()
+        if (isGenbankReplacementPossible(contigName, contigSynonyms, message)) {
+            return contigSynonyms.getGenBank()
+        }
+        throw new IllegalArgumentException(message.toString())
+    }
+
+    /* More lenient version of this method in accessioning pipeline:
+     * https://github.com/EBIvariation/eva-accession/blob/master/eva-accession-core/src/main/java/uk/ac/ebi/eva/accession/core/contig/ContigMapping.java#L169
+     */
+    boolean isGenbankReplacementPossible(String contig, ContigSynonyms contigSynonyms, StringBuilder reason) {
+        if (contigSynonyms == null) {
+            reason.append("Contig '" + contig + "' was not found in the assembly report!");
+            return false;
+        }
+
+        if (!hasText(contigSynonyms.getGenBank())) {
+            reason.append("No Genbank equivalent found for contig '" + contig + "' in the assembly report");
+            return false;
+        }
+        return true;
+    }
+
+}
diff --git a/tasks/eva_3660/src/main/groovy/eva3660/NormalisationProcessor.groovy b/tasks/eva_3660/src/main/groovy/eva3660/NormalisationProcessor.groovy
@@ -0,0 +1,119 @@
+package eva3660
+
+import uk.ac.ebi.eva.accession.core.batch.io.FastaSequenceReader
+
+import java.nio.file.Path
+
+class NormalisationProcessor {
+
+    private FastaSequenceReader fastaReader
+
+    NormalisationProcessor(Path pathToFasta) {
+        fastaReader = new FastaSequenceReader(pathToFasta)
+    }
+
+    /**
+     * Normalisation with specified reference, alternate, secondary alternate alleles, and maf allele.
+     * Also truncate common leading context allele if present (i.e. allows empty alleles).
+     *
+     * @param contig Name of contig as found in FASTA
+     * @param valsforNorm Values required for normalisation (position & various alleles) Position of variant
+     * @return normalised values
+     */
+    ValuesForNormalisation normaliseAndTruncate(String contig, ValuesForNormalisation valsForNorm) {
+        // mafAllele can be null
+        boolean mafIsNull = false
+        List<String> allelesToNorm = [valsForNorm.reference, valsForNorm.alternate, valsForNorm.mafAllele] + valsForNorm.secondaryAlternates
+        if (Objects.isNull(valsForNorm.mafAllele)) {
+            mafIsNull = true
+            allelesToNorm = [valsForNorm.reference, valsForNorm.alternate] + valsForNorm.secondaryAlternates
+        }
+        def (newStart, newEnd, newLength, newAlleles) = normalise(contig, valsForNorm.start, valsForNorm.end, valsForNorm.length, allelesToNorm)
+        // Check final base before initial base when truncating, to mirror eva-pipeline load:
+        //  https://github.com/EBIvariation/eva-pipeline/blob/master/src/main/java/uk/ac/ebi/eva/pipeline/io/mappers/VariantVcfFactory.java#L185
+        if (allSameEnd(newAlleles)) {
+            (newEnd, newLength, newAlleles) = truncateRightmost(newEnd, newLength, newAlleles)
+        }
+        else if (allSameStart(newAlleles)) {
+            (newStart, newLength, newAlleles) = truncateLeftmost(newStart, newLength, newAlleles)
+        }
+        String newReference = newAlleles.pop()
+        String newAlternate = newAlleles.pop()
+        String newMafAllele = mafIsNull ? null : newAlleles.pop()
+        return new ValuesForNormalisation(newStart, newEnd, newLength, newReference, newAlternate, newMafAllele, newAlleles)
+    }
+
+    /**
+     * Normalise alleles to be parsimonious and left-aligned.
+     * See here: https://genome.sph.umich.edu/wiki/Variant_Normalization
+     *
+     * @param contig Name of contig as found in FASTA
+     * @param start Start coordinate of variant
+     * @param end End coordinate of variant
+     * @param length Length of variant
+     * @param alleles List of alleles
+     * @return normalised coordinates and list of normalised alleles (guaranteed to preserve input order)
+     */
+    Tuple normalise(String contig, int start, int end, int length, List<String> alleles) {
+        // Allow for initially empty alleles
+        def (newStart, newLength, newAlleles) = addContextIfEmpty(contig, start, length, alleles)
+        def newEnd = end
+        // While all alleles end in same nucleotide
+        while (allSameEnd(newAlleles)) {
+            // Truncate rightmost nucleotide
+            (newEnd, newLength, newAlleles) = truncateRightmost(newEnd, newLength, newAlleles)
+            // If exists an empty allele, extend alleles 1 to the left
+            (newStart, newLength, newAlleles) = addContextIfEmpty(contig, newStart, newLength, newAlleles)
+        }
+        // While all alleles start with same nucleotide and have length 2 or more
+        while (allSameStart(newAlleles) && allLengthAtLeastTwo(newAlleles)) {
+            // Truncate leftmost nucleotide
+            (newStart, newLength, newAlleles) = truncateLeftmost(newStart, newLength, newAlleles)
+        }
+        return new Tuple(newStart, newEnd, newLength, newAlleles)
+    }
+
+    private Tuple addContextIfEmpty(String contig, int start, int length, List<String> alleles) {
+        def newStart = start
+        def newLength = length
+        def newAlleles = alleles
+        // If already at the edge of the contig, do nothing
+        if (start == 1) {
+            return new Tuple(newStart, newLength, newAlleles)
+        }
+        def existEmptyAlleles = alleles.stream().any{ it.size() < 1 }
+        if (existEmptyAlleles) {
+            // Extend alleles 1 to the left
+            newStart--
+            newLength++
+            def contextBase = fastaReader.getSequenceToUpperCase(contig, newStart, newStart)
+            newAlleles = newAlleles.stream().collect { "${contextBase}${it}" }
+        }
+        return new Tuple(newStart, newLength, newAlleles)
+    }
+
+    private boolean allSameEnd(List<String> alleles) {
+        return alleles.stream().collect{it[-1] }.toSet().size() == 1
+    }
+
+    private boolean allSameStart(List<String> alleles) {
+        return alleles.stream().collect{it[0] }.toSet().size() == 1
+    }
+
+    private boolean allLengthAtLeastTwo(List<String> alleles) {
+        return alleles.stream().allMatch{it.size() >= 2 }
+    }
+
+    private Tuple truncateRightmost(int end, int length, List<String> alleles) {
+        return new Tuple(--end, --length, alleles.stream().collect{ it.substring(0, it.size()-1) })
+    }
+
+    private Tuple truncateLeftmost(int start, int length, List<String> alleles) {
+        return new Tuple(++start, --length, alleles.stream().collect{ it.substring(1) })
+    }
+
+    void close() {
+        fastaReader.close()
+    }
+
+}