Skip to content

Commit

Permalink
Merge pull request #241 from tcezard/EVA3281_synonyms_on_one_line_are…
Browse files Browse the repository at this point in the history
…_allowed

EVA-3281 - Allow synonyms to be the same value if they are on the same line
  • Loading branch information
tcezard authored Apr 30, 2024
2 parents b1d57e5 + c4bf905 commit f134b4a
Show file tree
Hide file tree
Showing 9 changed files with 1,339 additions and 1 deletion.
6 changes: 5 additions & 1 deletion inc/assembly_report/assembly_report.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,11 @@ namespace ebi

/**
 * Records a synonym in `synonyms`, ignoring values that are already stored.
 *
 * The same name may legitimately appear in several columns of a single
 * assembly report line; adding it more than once must leave a single entry.
 *
 * @param synonym name to record
 */
void add_synonym(std::string synonym)
{
    // Only add the synonym if there isn't one already
    if (std::find(synonyms.begin(), synonyms.end(), synonym) == synonyms.end())
    {
        synonyms.push_back(synonym);
    }
}

private:
Expand Down
8 changes: 8 additions & 0 deletions test/assembly_report/assembly_report_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ namespace ebi
CHECK_NOTHROW(synonyms_map.parse_assembly_report(input));
}

SECTION("Parsing assembly report with same value in one line")
{
    // The fixture repeats the same value in more than one column of a single
    // line; parsing it must not be reported as an error.
    ebi::assembly_report::SynonymsMap synonyms_map;
    std::string assembly_report_path = "test/input_files/assembly_report/full_assembly_report/passed_with_same_synonym_on_same_line.txt";
    std::ifstream input{assembly_report_path};
    // Stop here if the fixture could not be opened, so the check below is
    // known to run against the fixture contents rather than a failed stream.
    REQUIRE(input.is_open());
    CHECK_NOTHROW(synonyms_map.parse_assembly_report(input));
}

SECTION("Failing to parse assembly report due to incorrect column size")
{
ebi::assembly_report::SynonymsMap synonyms_map;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Assembly name: ASM710v1
# Organism name: Zymomonas mobilis subsp. mobilis ZM4 = ATCC 31821 (a-proteobacteria)
# Infraspecific name: strain=ZM4
# Taxid: 264203
# BioSample: SAMN02603579
# BioProject: PRJNA12354
# Submitter: Macrogen Inc.
# Date: 2010-01-12
# Assembly type: na
# Release type: major
# Assembly level: Complete Genome
# Genome representation: full
# GenBank assembly accession: GCA_000007105.1
# RefSeq assembly accession: GCF_000007105.1
# RefSeq assembly and GenBank assemblies identical: yes
#
## Assembly-Units:
## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name
## GCA_000007115.1 GCF_000007115.1 Primary Assembly
#
# Ordered by chromosome/plasmid; the chromosomes/plasmids are followed by
# unlocalized scaffolds.
# Unplaced scaffolds are listed at the end.
# RefSeq is equal or derived from GenBank object.
#
# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
AE008692.2 assembled-molecule na Chromosome AE008692.2 = NC_006526.2 Primary Assembly 2056363 na
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
CM000072.5 assembled-molecule 1 Chromosome CM000072.5 = NC_005100.4 Primary Assembly 282763074 1
CM000082.5 assembled-molecule 1 Chromosome CM000082.5 = NC_005300.4 Primary Assembly 282763044 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Sequence-Name Sequence-Role Assigned-Molecule Assigned-Molecule-Location/Type GenBank-Accn Relationship RefSeq-Accn Assembly-Unit Sequence-Length UCSC-style-name
CM000072.5 assembled-molecule 1 Chromosome CM000072.5 = NC_005100.4 Primary Assembly 282763074 1
chr2 assembled-molecule 1 Chromosome CM000082.5 = NC_005300.4 Primary Assembly 282763044 2

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CM000072.5 59940 56 60 61

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions test/vcf/assembly_checker_integration_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,16 @@ namespace ebi
CHECK(ebi::vcf::assembly_checker::check_vcf_ref(vcf_input, vcf_path, fasta_path, assembly_report_path, outputs));
}

SECTION("Mapping contigs but contig has same synonym on the same line, all match")
{
    boost::filesystem::path case_dir{"test/input_files/v4.3/assembly_checker/passed/passed_with_mapping_multi_synonyms/"};

    // The path ends with a separator, so parent_path() strips it and
    // filename() then yields the directory name, which is also the prefix
    // of the VCF and FASTA files stored inside that directory.
    std::string stem = case_dir.string() + case_dir.parent_path().filename().string();

    std::string vcf_path = stem + ebi::vcf::VCF_EXT;
    std::string fasta_path = stem + ebi::vcf::FASTA_EXT;
    std::string assembly_report_path = case_dir.string() + "assembly_report.txt";

    std::ifstream vcf_input{vcf_path};
    CHECK(ebi::vcf::assembly_checker::check_vcf_ref(vcf_input, vcf_path, fasta_path, assembly_report_path, outputs));
}

SECTION("compressed VCF, gz compression")
{
Expand Down

0 comments on commit f134b4a

Please sign in to comment.