Skip to content

Commit

Permalink
Improvements to the FASTA format parser and the sources attribute
Browse files: browse the repository at this point in the history
  • Loading branch information
LeonardoGonzales committed Oct 1, 2024
1 parent 01539d9 commit 335954d
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
*/
public interface UniParcCrossReference extends CrossReference<UniParcDatabase> {

public static final String PROPERTY_SOURCES = "sources";

int getVersionI();

Integer getVersion();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,70 +11,94 @@
import java.util.concurrent.atomic.AtomicReference;

import static org.uniprot.core.parser.fasta.FastaUtils.parseSequence;
import static org.uniprot.core.util.Utils.*;

public class UniParcProteomeFastaParser {

private static final Set<UniParcDatabase> uniProtDatabases = Set.of(
UniParcDatabase.SWISSPROT, UniParcDatabase.TREMBL, UniParcDatabase.SWISSPROT_VARSPLIC);

public static String toFasta(UniParcEntry entry, String proteomeID) {
public static String toFasta(UniParcEntry entry) {
String id = entry.getUniParcId().getValue();

List<String> proteinName = new ArrayList<>();
List<String> geneNames = new ArrayList<>();
List<String> accessions = new ArrayList<>();
Set<String> sourceIds = new HashSet<>();
Set<String> component = new HashSet<>();
AtomicReference<Organism> organism = new AtomicReference<>();
AtomicReference<String> proteomeIdValue = new AtomicReference<>();
List<UniParcCrossReference> uniProtXrefs = new ArrayList<>();
List<UniParcCrossReference> sourceXrefs = new ArrayList<>();
boolean hasActive = false;
boolean hasSourceActive = false;
String proteomeId = null;
for(UniParcCrossReference xref: entry.getUniParcCrossReferences()){
if(uniProtDatabases.contains(xref.getDatabase())){
uniProtXrefs.add(xref);
if(xref.isActive()){
hasActive = true;
}
} else {
sourceXrefs.add(xref);
proteomeId = xref.getProteomeId();
if(xref.isActive()){
hasSourceActive = true;
}
}
}
StringBuilder sb = new StringBuilder();
if(!uniProtXrefs.isEmpty()){
sb.append(getFastaHeader(uniProtXrefs, hasActive, id, proteomeId, false));
} else {
sb.append(getFastaHeader(sourceXrefs, hasSourceActive, id, proteomeId, true));
}
sb.append("\n");
sb.append(parseSequence(entry.getSequence().getValue()));
return sb.toString();
}

entry.getUniParcCrossReferences().stream()
.filter(xref -> uniProtDatabases.contains(xref.getDatabase()))
.sorted(Comparator.comparing(UniParcCrossReference::isActive,Comparator.reverseOrder()))
.forEach(xref -> {
if(uniProtDatabases.contains(xref.getDatabase())) {
if (Utils.notNullNotEmpty(xref.getId())) {
accessions.add(xref.getId());
}
if (Utils.notNullNotEmpty(xref.getProteinName())) {
proteinName.add(xref.getProteinName());
}
if (Utils.notNullNotEmpty(xref.getGeneName())) {
geneNames.add(xref.getGeneName());
}
if (Utils.notNull(xref.getOrganism())) {
organism.set(xref.getOrganism());
}
if (Utils.notNullNotEmpty(xref.getProperties())) {
xref.getProperties().stream()
.filter(p -> "source".equals(p.getKey()))
.map(Property::getValue)
.forEach(source -> {
String[] sources = source.split(":");
if(sources.length > 0){
sourceIds.add(sources[0]);
}
if(sources.length > 1){
proteomeIdValue.set(sources[1]);
private static StringBuilder getFastaHeader(List<UniParcCrossReference> xrefs, boolean hasActive, String id, String proteomeId, boolean isSource) {
Set<String> proteinName = new LinkedHashSet<>();
Set<String> geneNames = new LinkedHashSet<>();
Set<String> accessions = new LinkedHashSet<>();
Set<String> sourceIds = new LinkedHashSet<>();
Set<String> component = new LinkedHashSet<>();
Organism organism = null;
for(UniParcCrossReference xref: xrefs) {
if (xref.isActive() == hasActive) {
addOrIgnoreNull(xref.getProteinName(), proteinName);
addOrIgnoreNull(xref.getGeneName(), geneNames);
organism = xref.getOrganism();

if (isSource) {
addOrIgnoreNull(xref.getId(), sourceIds);
addOrIgnoreNull(xref.getComponent(), component);
} else {
addOrIgnoreNull(xref.getId(), accessions);
if (notNullNotEmpty(xref.getProperties())) {
xref.getProperties().stream()
.filter(p -> UniParcCrossReference.PROPERTY_SOURCES.equals(p.getKey()))
.map(Property::getValue)
.forEach(value -> {
String[] sources = value.split(",");
for (String source : sources) {
String[] ids = source.split(":");
if (ids.length > 1 && proteomeId.equals(ids[1])) {
sourceIds.add(ids[0]);
}
if(sources.length > 2){
component.add(sources[2]);
if (ids.length > 2 && proteomeId.equals(ids[1])) {
component.add(ids[2]);
}
});
}
}
});
}
});

}
}
}
StringBuilder sb = new StringBuilder();
sb.append(">").append(id);
if(!proteinName.isEmpty()){
sb.append(" ").append(String.join("|", proteinName));
}
if (Utils.notNull(organism.get())) {
if (organism.get().hasScientificName()) {
sb.append(" OS=").append(organism.get().getScientificName());
if (notNull(organism)) {
if (organism.hasScientificName()) {
sb.append(" OS=").append(organism.getScientificName());
}
sb.append(" OX=").append(organism.get().getTaxonId());
sb.append(" OX=").append(organism.getTaxonId());
}

if(!geneNames.isEmpty()){
Expand All @@ -86,13 +110,10 @@ public static String toFasta(UniParcEntry entry, String proteomeID) {
if(!sourceIds.isEmpty()){
sb.append(" SS=").append(String.join("|", sourceIds));
}
sb.append(" UP=").append(proteomeIdValue.get());
sb.append(" UP=").append(proteomeId);
if(!component.isEmpty()){
sb.append(":").append(String.join("|", component));
}

sb.append("\n");
sb.append(parseSequence(entry.getSequence().getValue()));
return sb.toString();
return sb;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class UniParcProteomeFastaParserTest {
@Test
void toFastaFullSingleValues() {
UniParcEntry entry = create();
String fasta = UniParcProteomeFastaParser.toFasta(entry, "UP000005640");
String fasta = UniParcProteomeFastaParser.toFasta(entry);
String expected =
">UPI0000083A08|EMBL:CQR81549 UP=UP000005640:Chromosome 1 OX=9606 OS=Homo Sapiens AC=P12345\n" +
"MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" +
Expand All @@ -42,7 +42,7 @@ void toFastaFullMultiValues() {
.uniParcCrossReferencesSet(xrefs)
.sequence(getSequence())
.build();
String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId);
String fasta = UniParcProteomeFastaParser.toFasta(entry);
String expected =
">UPI0000083A09|EMBL_CON:XP12345|EMBL_TPA:XP54321 UP=UP000005640:C1|C2 OX=9606 OS=Homo Sapiens AC=P21802|P12345\n" +
"MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" +
Expand All @@ -62,7 +62,7 @@ void toFastaWithoutAccessions() {
.uniParcCrossReferencesSet(xrefs)
.sequence(getSequence())
.build();
String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId);
String fasta = UniParcProteomeFastaParser.toFasta(entry);
String expected =
">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C1 OX=9606 OS=Homo Sapiens\n" +
"MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" +
Expand All @@ -81,7 +81,7 @@ void toFastaWithoutComponent() {
.uniParcCrossReferencesSet(xrefs)
.sequence(getSequence())
.build();
String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId);
String fasta = UniParcProteomeFastaParser.toFasta(entry);
String expected =
">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640 OX=9606 OS=Homo Sapiens\n" +
"MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" +
Expand All @@ -99,7 +99,7 @@ void toFastaWithoutOrganism() {
.uniParcCrossReferencesSet(xrefs)
.sequence(getSequence())
.build();
String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId);
String fasta = UniParcProteomeFastaParser.toFasta(entry);
String expected =
">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C8\n" +
"MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" +
Expand All @@ -117,7 +117,7 @@ void toFastaWithoutSource() {
.uniParcCrossReferencesSet(xrefs)
.sequence(getSequence())
.build();
String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId);
String fasta = UniParcProteomeFastaParser.toFasta(entry);
String expected =
">UPI0000083A09 UP=UP000005640:C9\n" +
"MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" +
Expand All @@ -136,7 +136,7 @@ void toFastaFilterInactiveSources() {
.uniParcCrossReferencesSet(xrefs)
.sequence(getSequence())
.build();
String fasta = UniParcProteomeFastaParser.toFasta(entry, proteomeId);
String fasta = UniParcProteomeFastaParser.toFasta(entry);
String expected =
">UPI0000083A09|EMBL_CON:XP12345 UP=UP000005640:C5\n" +
"MSMAMARALATLGRLRYRVSGQLPLLDETAIEVMAGGQFLDGRKAREELGFFSTTALDDT\n" +
Expand Down

0 comments on commit 335954d

Please sign in to comment.