Skip to content

Commit

Permalink
update for CNV:TR and optional leading phasing info
Browse files Browse the repository at this point in the history
  • Loading branch information
vasudeva8 committed Jun 18, 2024
1 parent 004c7e5 commit 23e909d
Show file tree
Hide file tree
Showing 11 changed files with 863 additions and 32 deletions.
1 change: 1 addition & 0 deletions inc/util/string_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ namespace ebi
/**
* Splits `s` using `delims` as separator and fills the container `ret` with the parts.
* An empty string results in an empty container `ret`.
* Expects a string without leading separators; if one is present, it becomes part of the first string.
* @param s input string to split
* @param delims any character here acts as a separator
* @param ret return by reference the container filled with the string split.
Expand Down
15 changes: 14 additions & 1 deletion inc/vcf/file_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,15 @@ namespace ebi
{ SVTYPE, { STRING, "1" } },
{ VALIDATED, { FLAG, "0" } },
{ THOUSAND_G, { FLAG, "0" } },
{ SVCLAIM, { STRING, "A" } }
{ SVCLAIM, { STRING, "A" } },
{ RN, { INTEGER, "A" } },
{ RUS, { STRING, UNKNOWN_CARDINALITY } },
{ RUL, { INTEGER, UNKNOWN_CARDINALITY } },
{ RUC, { FLOAT, UNKNOWN_CARDINALITY } },
{ RB, { INTEGER, UNKNOWN_CARDINALITY } },
{ CIRUC, { FLOAT, UNKNOWN_CARDINALITY } },
{ CIRB, { INTEGER, UNKNOWN_CARDINALITY } },
{ RUB, { INTEGER, UNKNOWN_CARDINALITY } }
};

const std::map<std::string, std::pair<std::string, std::string>> format_v41_v42 = {
Expand Down Expand Up @@ -724,6 +732,11 @@ namespace ebi
* @throw InfoBodyError
*/
void check_info_have_mandatory() const;

/**
* gets total RN count
*/
int getRNvalue() const;
};

std::ostream &operator<<(std::ostream &os, const Record &record);
Expand Down
2 changes: 2 additions & 0 deletions inc/vcf/optional_policy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ namespace ebi
void check_body_entry_info_svlen(ParsingState & state, Record const & record) const;
void check_body_entry_info_svclaim(ParsingState & state, Record const & record) const;
void check_body_entry_info_confidence_interval(ParsingState & state, Record const & record) const;
void check_body_entry_info_rb_ruc(ParsingState & state, Record const & record) const;
void check_body_entry_info_rul_rus(ParsingState & state, Record const & record) const;
void check_contig_meta(ParsingState & state, Record const & record) const;
void check_alternate_allele_meta(ParsingState & state, Record const & record) const;
void check_filter_meta(ParsingState & state, Record const & record) const;
Expand Down
8 changes: 8 additions & 0 deletions inc/vcf/string_constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,14 @@ namespace ebi
const std::string VALIDATED = "VALIDATED";
const std::string THOUSAND_G = "1000G";
const std::string SVCLAIM = "SVCLAIM";
const std::string RN = "RN";
const std::string RUS = "RUS";
const std::string RUL = "RUL";
const std::string RUC = "RUC";
const std::string RB = "RB";
const std::string CIRUC = "CIRUC";
const std::string CIRB = "CIRB";
const std::string RUB = "RUB";

// FORMAT predefined tags
const std::string AHAP = "AHAP";
Expand Down
258 changes: 238 additions & 20 deletions src/vcf/record.cpp

Large diffs are not rendered by default.

87 changes: 79 additions & 8 deletions src/vcf/validate_optional_policy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ namespace ebi
// SVCLAIM check
check_body_entry_info_svclaim(state, record);

// RB RUC check
check_body_entry_info_rb_ruc(state, record);

// RUL RUS check
check_body_entry_info_rul_rus(state, record);

// Confidence interval tags should have first value <=0 and second value >= 0
check_body_entry_info_confidence_interval(state, record);

Expand Down Expand Up @@ -225,7 +231,7 @@ namespace ebi
std::vector<std::string> values;

if (record.source->version < Version::v44) {
return; //svclaim not present for version < v43
return; //svclaim not present for version < v44
}
auto it = record.info.find(SVCLAIM);
if (it == record.info.end()) {
Expand All @@ -251,24 +257,89 @@ namespace ebi

/**
 * Checks every confidence interval INFO tag present in the record
 * (CICN, CICNADJ, CIEND, CILEN, CIPOS, CIRB, CIRUC).
 *
 * The value is split on ',' and read as consecutive (lower, upper) pairs. The check fails when
 *  - the number of entries is odd, or
 *  - a lower bound is > 0, an upper bound is < 0, or an entry is not fully numeric.
 * A missing value is accepted and treated as 0.
 *
 * CIRUC is parsed as a floating point number because it is declared as FLOAT in the predefined
 * INFO tags; all other tags are parsed as integers.
 *
 * @param state parsing state, its n_lines is reported in the error
 * @param record record whose INFO column is checked
 * @throw InfoBodyError
 */
void ValidateOptionalPolicy::check_body_entry_info_confidence_interval(ParsingState & state, Record const & record) const
{
    const std::vector<std::string> confidence_interval_tags = { CICN, CICNADJ, CIEND, CILEN, CIPOS, CIRB, CIRUC };
    for (auto const & confidence_interval_tag : confidence_interval_tags) {
        auto it = record.info.find(confidence_interval_tag);
        if (it == record.info.end()) {
            continue; // tag not present, nothing to check
        }

        std::vector<std::string> values;
        util::string_split(it->second, ",", values);
        if (values.size() % 2 != 0) { //CI should have even count
            throw new InfoBodyError{state.n_lines,
                                    "INFO " + confidence_interval_tag +
                                    " is a confidence interval tag, which should have even number entries"};
        }

        // std::stoi would stop at the decimal point of a float bound and make the
        // "fully scanned" comparison below reject an otherwise valid CIRUC value
        const bool is_float_tag = (confidence_interval_tag == CIRUC);
        auto parse_bound = [is_float_tag](std::string const & text, size_t & scanned) -> double {
            if (text == MISSING_VALUE) {
                //considers missing value as 0 - valid value
                scanned = text.size();
                return 0;
            }
            return is_float_tag ? std::stod(text, &scanned)
                                : static_cast<double>(std::stoi(text, &scanned));
        };

        for (size_t i = 0; i < values.size(); i += 2) {
            size_t scanned_first_value_length = 0, scanned_second_value_length = 0;
            const double first_numeric_value = parse_bound(values[i], scanned_first_value_length);
            const double second_numeric_value = parse_bound(values[i + 1], scanned_second_value_length);
            if (first_numeric_value > 0 || second_numeric_value < 0
                || values[i].size() != scanned_first_value_length || values[i + 1].size() != scanned_second_value_length) {
                throw new InfoBodyError{state.n_lines,
                                        "INFO " + confidence_interval_tag +
                                        " is a confidence interval tag, which should have first value <= 0 and second value >= 0"};
            }
        }
    }
}

/**
 * Checks that every RB entry is approximately RUC * unit_length (within 5%).
 *
 * The unit length is taken from RUL when present, otherwise from the length of the
 * corresponding RUS entry. Nothing is checked when
 *  - the file version is older than v4.4,
 *  - RB or RUC is absent, or neither RUL nor RUS is present,
 *  - the lists differ in size (the original code notes this is already checked in records),
 *  - any of the values needed for one position is missing (that position is skipped).
 *
 * RUC is parsed as a floating point number because it is declared as FLOAT in the
 * predefined INFO tags; RB and RUL are parsed as integers.
 *
 * @param state parsing state (not used by this check)
 * @param record record whose INFO column is checked
 * @throw InfoBodyError
 */
void ValidateOptionalPolicy::check_body_entry_info_rb_ruc(ParsingState & state, Record const & record) const
{
    const float limit = 0.05; //5% variation

    if (record.source->version < Version::v44) {
        return; //not valid for version < v44
    }
    auto itRB = record.info.find(RB);
    auto itRUC = record.info.find(RUC);
    auto itRUL = record.info.find(RUL);
    auto itRUS = record.info.find(RUS);
    if (itRB == record.info.end() || itRUC == record.info.end()) {
        return; //nothing to check
    }
    const bool has_rul = itRUL != record.info.end();
    if (!has_rul && itRUS == record.info.end()) {
        return; //no unit length available; itRUS must not be dereferenced
    }

    std::vector<std::string> valRB, valRUC, valLen;
    util::string_split(itRB->second, ",", valRB);
    util::string_split(itRUC->second, ",", valRUC);
    util::string_split(has_rul ? itRUL->second : itRUS->second, ",", valLen);
    if (valRB.size() != valRUC.size() || valRB.size() != valLen.size()) {
        return; //already checked in records
    }

    for (size_t i = 0; i < valRB.size(); ++i) {
        if (valRB[i] == MISSING_VALUE || valRUC[i] == MISSING_VALUE || valLen[i] == MISSING_VALUE) {
            continue; //cannot compare without all three values
        }
        const int rb = std::stoi(valRB[i]);
        const float ruc = std::stof(valRUC[i]);
        const int rul = has_rul ? std::stoi(valLen[i]) : static_cast<int>(valLen[i].size());

        //RB ~= RUL * RUC
        float deviation = rb - rul * ruc;
        if (deviation < 0) {
            deviation = -deviation;
        }
        //compared as deviation > limit * |RB| so that RB = 0 does not divide by zero
        const float magnitude = rb < 0 ? -static_cast<float>(rb) : static_cast<float>(rb);
        if (deviation > limit * magnitude) {
            std::stringstream message;
            message << "INFO " << "RB should be approximately RUC * unit_length";
            throw new InfoBodyError{record.line, message.str(), "Failed for position " + std::to_string(i)};
        }
    }
}

/**
 * Reports records that carry both RUS and RUL in INFO, since the pair is redundant.
 * Only applied to files of version v4.4 or later.
 *
 * @param state parsing state (not used by this check)
 * @param record record whose INFO column is checked
 * @throw InfoBodyError
 */
void ValidateOptionalPolicy::check_body_entry_info_rul_rus(ParsingState & state, Record const & record) const
{
    if (!(record.source->version < Version::v44)) { //not valid for version < v44
        const bool has_rul = record.info.find(RUL) != record.info.end();
        const bool has_rus = record.info.find(RUS) != record.info.end();
        if (has_rul && has_rus) { //RUS, RUL together - redundant info
            const std::string message = std::string("INFO ") + "RUS and RUL present together, RUL can be avoided";
            throw new InfoBodyError{record.line, message};
        }
    }
}

void ValidateOptionalPolicy::check_contig_meta(ParsingState & state, Record const & record) const
{
Expand Down
2 changes: 1 addition & 1 deletion test/input_files/v4.4/passed/passed_body_format.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
1 400 rs182711216 C T 100 PASS AC=4 GT:G_S:GL 0|0:0.000:-0.18,-0.48,-2.49 0|0:0.000:-0.20,-0.44,-2.06
1 500 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL 0|0:0.000:-0.18,-0.48,-2.49 0|0:0.000:-0.20,-0.44,-2.06
1 600 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL |0|0:0.000:-0.18,-0.48,-2.49 |0|0:0.000:-0.20,-0.44,-2.06
1 700 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL \0\0:0.000:-0.18,-0.48,-2.49 |0|0:0.000:-0.20,-0.44,-2.06
1 700 rs182711216 C T 100 PASS AC=4 GT:G%3AS:GL /0/0:0.000:-0.18,-0.48,-2.49 |0|0:0.000:-0.20,-0.44,-2.06
131 changes: 130 additions & 1 deletion test/vcf/metaentry_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3234,7 +3234,136 @@ namespace ebi
source
}),
vcf::MetaSectionError* );
}

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RN}, {vcf::NUMBER, vcf::A}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Total num. of repeat seq. in this allele."} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid number
1,
vcf::INFO,
{ {vcf::ID, vcf::RN}, {vcf::NUMBER, vcf::R}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Total num. of repeat seq. in this allele."} },
source
}),
vcf::MetaSectionError* );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RN}, {vcf::NUMBER, vcf::A}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Total num. of repeat seq. in this allele."} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RUS}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::STRING}, {vcf::DESCRIPTION, "A Repeat unit sequence"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RUS}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "A Repeat unit sequence"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RUL}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Length of repeating unit"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RUL}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Length of repeating unit"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Count of repeating unit"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Count of repeating unit"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Total num. of bases in repeat seq."} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Total num. of bases in repeat seq."} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::CIRUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Confidence interval for RUC"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::CIRUC}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Confidence interval for RUC"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::CIRB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Confidence interval for RB"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::CIRB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Confidence interval for RB"} },
source
}),
vcf::MetaSectionError* );

CHECK_NOTHROW( (vcf::MetaEntry { //valid definition
1,
vcf::INFO,
{ {vcf::ID, vcf::RUB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::INTEGER}, {vcf::DESCRIPTION, "Number of bases in repeat unit"} },
source
} ) );

CHECK_THROWS_AS( (vcf::MetaEntry { //invalid type
1,
vcf::INFO,
{ {vcf::ID, vcf::RUB}, {vcf::NUMBER, vcf::UNKNOWN_CARDINALITY}, {vcf::TYPE, vcf::FLOAT}, {vcf::DESCRIPTION, "Number of bases in repeat unit"} },
source
}),
vcf::MetaSectionError* );

}
}

TEST_CASE("SAMPLE MetaEntry checks", "[checks][keyvalue]")
Expand Down
Loading

0 comments on commit 23e909d

Please sign in to comment.