From 335954df1128f66aeaebffeee214b2663687510a Mon Sep 17 00:00:00 2001 From: LeonardoGonzales Date: Tue, 1 Oct 2024 11:25:32 +0100 Subject: [PATCH] Improvements in the fasta Format parser and sources attribute --- .../core/uniparc/UniParcCrossReference.java | 2 + .../uniparc/UniParcProteomeFastaParser.java | 123 ++++++++++-------- .../UniParcProteomeFastaParserTest.java | 14 +- 3 files changed, 81 insertions(+), 58 deletions(-) diff --git a/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcCrossReference.java b/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcCrossReference.java index 0871d624f..783081643 100644 --- a/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcCrossReference.java +++ b/core-domain/src/main/java/org/uniprot/core/uniparc/UniParcCrossReference.java @@ -11,6 +11,8 @@ */ public interface UniParcCrossReference extends CrossReference { + public static final String PROPERTY_SOURCES = "sources"; + int getVersionI(); Integer getVersion(); diff --git a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java index 15c695823..25025789d 100644 --- a/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java +++ b/core-parser/src/main/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParser.java @@ -11,70 +11,94 @@ import java.util.concurrent.atomic.AtomicReference; import static org.uniprot.core.parser.fasta.FastaUtils.parseSequence; +import static org.uniprot.core.util.Utils.*; public class UniParcProteomeFastaParser { private static final Set uniProtDatabases = Set.of( UniParcDatabase.SWISSPROT, UniParcDatabase.TREMBL, UniParcDatabase.SWISSPROT_VARSPLIC); - public static String toFasta(UniParcEntry entry, String proteomeID) { + public static String toFasta(UniParcEntry entry) { /* the proteome id is now read from the entry's non-UniProt cross-references */ String id = entry.getUniParcId().getValue(); - List proteinName = 
new ArrayList<>(); - List geneNames = new ArrayList<>(); - List accessions = new ArrayList<>(); - Set sourceIds = new HashSet<>(); - Set component = new HashSet<>(); - AtomicReference organism = new AtomicReference<>(); - AtomicReference proteomeIdValue = new AtomicReference<>(); + List uniProtXrefs = new ArrayList<>(); + List sourceXrefs = new ArrayList<>(); + boolean hasActive = false; + boolean hasSourceActive = false; + String proteomeId = null; + for(UniParcCrossReference xref: entry.getUniParcCrossReferences()){ + if(uniProtDatabases.contains(xref.getDatabase())){ + uniProtXrefs.add(xref); + if(xref.isActive()){ + hasActive = true; + } + } else { + sourceXrefs.add(xref); + proteomeId = xref.getProteomeId(); + if(xref.isActive()){ + hasSourceActive = true; + } + } + } + StringBuilder sb = new StringBuilder(); + if(!uniProtXrefs.isEmpty()){ + sb.append(getFastaHeader(uniProtXrefs, hasActive, id, proteomeId, false)); + } else { + sb.append(getFastaHeader(sourceXrefs, hasSourceActive, id, proteomeId, true)); + } + sb.append("\n"); + sb.append(parseSequence(entry.getSequence().getValue())); + return sb.toString(); + } - entry.getUniParcCrossReferences().stream() - .filter(xref -> uniProtDatabases.contains(xref.getDatabase())) - .sorted(Comparator.comparing(UniParcCrossReference::isActive,Comparator.reverseOrder())) - .forEach(xref -> { - if(uniProtDatabases.contains(xref.getDatabase())) { - if (Utils.notNullNotEmpty(xref.getId())) { - accessions.add(xref.getId()); - } - if (Utils.notNullNotEmpty(xref.getProteinName())) { - proteinName.add(xref.getProteinName()); - } - if (Utils.notNullNotEmpty(xref.getGeneName())) { - geneNames.add(xref.getGeneName()); - } - if (Utils.notNull(xref.getOrganism())) { - organism.set(xref.getOrganism()); - } - if (Utils.notNullNotEmpty(xref.getProperties())) { - xref.getProperties().stream() - .filter(p -> "source".equals(p.getKey())) - .map(Property::getValue) - .forEach(source -> { - String[] sources = source.split(":"); - 
if(sources.length > 0){ - sourceIds.add(sources[0]); - } - if(sources.length > 1){ - proteomeIdValue.set(sources[1]); + private static StringBuilder getFastaHeader(List xrefs, boolean hasActive, String id, String proteomeId, boolean isSource) { /* builds the header line from the xrefs whose active flag equals hasActive */ + Set proteinName = new LinkedHashSet<>(); + Set geneNames = new LinkedHashSet<>(); + Set accessions = new LinkedHashSet<>(); + Set sourceIds = new LinkedHashSet<>(); + Set component = new LinkedHashSet<>(); + Organism organism = null; + for(UniParcCrossReference xref: xrefs) { + if (xref.isActive() == hasActive) { + addOrIgnoreNull(xref.getProteinName(), proteinName); + addOrIgnoreNull(xref.getGeneName(), geneNames); + if (notNull(xref.getOrganism())) { organism = xref.getOrganism(); } /* keep an organism already found when a later xref has none, as the removed code did */ + + if (isSource) { + addOrIgnoreNull(xref.getId(), sourceIds); + addOrIgnoreNull(xref.getComponent(), component); + } else { + addOrIgnoreNull(xref.getId(), accessions); + if (notNullNotEmpty(xref.getProperties())) { + xref.getProperties().stream() + .filter(p -> UniParcCrossReference.PROPERTY_SOURCES.equals(p.getKey())) + .map(Property::getValue) + .forEach(value -> { + String[] sources = value.split(","); + for (String source : sources) { + String[] ids = source.split(":"); + if (ids.length > 1 && ids[1].equals(proteomeId)) { /* ids[1] is never null, while proteomeId stays null when no source xref set it */ + sourceIds.add(ids[0]); } - if(sources.length > 2){ - component.add(sources[2]); + if (ids.length > 2 && ids[1].equals(proteomeId)) { + component.add(ids[2]); } - }); - } + } + }); } - }); - + } + } + } StringBuilder sb = new StringBuilder(); sb.append(">").append(id); if(!proteinName.isEmpty()){ sb.append(" ").append(String.join("|", proteinName)); } - if (Utils.notNull(organism.get())) { - if (organism.get().hasScientificName()) { - sb.append(" OS=").append(organism.get().getScientificName()); + if (notNull(organism)) { + if (organism.hasScientificName()) { + sb.append(" OS=").append(organism.getScientificName()); } - sb.append(" OX=").append(organism.get().getTaxonId()); + sb.append(" OX=").append(organism.getTaxonId()); } if(!geneNames.isEmpty()){ 
@@ -86,13 +110,10 @@ public static String toFasta(UniParcEntry entry, String proteomeID) { if(!sourceIds.isEmpty()){ sb.append(" SS=").append(String.join("|", sourceIds)); } - sb.append(" UP=").append(proteomeIdValue.get()); + sb.append(" UP=").append(proteomeId); if(!component.isEmpty()){ sb.append(":").append(String.join("|", component)); } - - sb.append("\n"); - sb.append(parseSequence(entry.getSequence().getValue())); - return sb.toString(); + return sb; } } diff --git a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java index 102e87cba..b9a115d32 100644 --- a/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java +++ b/core-parser/src/test/java/org/uniprot/core/parser/fasta/uniparc/UniParcProteomeFastaParserTest.java @@ -20,7 +20,7 @@ class UniParcProteomeFastaParserTest { @Test void toFastaFullSingleValues() { UniParcEntry entry = create(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, "UP000005640"); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A08|EMBL:CQR81549 UP=UP000005640:Chromosome 1 OX=9606 OS=Homo Sapiens AC=P12345\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -42,7 +42,7 @@ void toFastaFullMultiValues() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345|EMBL_TPA:XP54321 UP=UP000005640:C1|C2 OX=9606 OS=Homo Sapiens AC=P21802|P12345\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -62,7 +62,7 @@ void toFastaWithoutAccessions() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = 
UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C1 OX=9606 OS=Homo Sapiens\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -81,7 +81,7 @@ void toFastaWithoutComponent() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640 OX=9606 OS=Homo Sapiens\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -99,7 +99,7 @@ void toFastaWithoutOrganism() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C8\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -117,7 +117,7 @@ void toFastaWithoutSource() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09 UP=UP000005640:C9\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" + @@ -136,7 +136,7 @@ void toFastaFilterInactiveSources() { .uniParcCrossReferencesSet(xrefs) .sequence(getSequence()) .build(); - String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId); + String fasta = UniParcProteomeFastaParser.toFasta(entry); String expected = ">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C5\n" + "MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" +