diff --git a/src/svaba/AlignmentFragment.cpp b/src/svaba/AlignmentFragment.cpp index 83290ea..dfae579 100644 --- a/src/svaba/AlignmentFragment.cpp +++ b/src/svaba/AlignmentFragment.cpp @@ -28,7 +28,7 @@ AlignmentFragment::AlignmentFragment(const SeqLib::BamRecord &talign, bool flip) m_align = talign; - talign.GetIntTag("SQ", sub_n); + assert(talign.GetIntTag("SQ", sub_n)); // orient cigar so it is on the contig orientation. // need to do this to get the right ordering of the contig fragments below diff --git a/src/svaba/BreakPoint.cpp b/src/svaba/BreakPoint.cpp index 5fbcc25..4f951bd 100644 --- a/src/svaba/BreakPoint.cpp +++ b/src/svaba/BreakPoint.cpp @@ -9,6 +9,9 @@ #include "svaba_params.h" +// n is the max integer given the int size (e.g. 255). x is string with int +#define INTNSTOI(x,n) std::min((int)n, std::stoi(x)); + // define repeats static std::vector repr = {"AAAAAAAAAAAAAAAA", "TTTTTTTTTTTTTTTT", "CCCCCCCCCCCCCCCC", "GGGGGGGGGGGGGGGG", @@ -63,13 +66,14 @@ using namespace SeqLib; for (auto& s : allele) max_lod = std::max(max_lod, s.second.LO); - ss << b1.chr_name << sep << b1.gr.pos1 << sep << b1.gr.strand << sep - << b2.chr_name << sep << b2.gr.pos1 << sep << b2.gr.strand << sep - << ref << sep << alt << sep - << getSpan() << sep - << b1.mapq << sep << b2.mapq << sep - << b1.nm << sep << b2.nm << sep - << dc.mapq1 << sep << dc.mapq2 << sep + ss << b1.chr_name << sep << b1.gr.pos1 << sep << b1.gr.strand << sep //1-3 + << b2.chr_name << sep << b2.gr.pos1 << sep << b2.gr.strand << sep //4-6 + << ref << sep << alt << sep //7-8 + << getSpan() << sep //9 + << b1.mapq << sep << b2.mapq << sep //10-11 + << b1.nm << sep << b2.nm << sep //12-13 + << dc.mapq1 << sep << dc.mapq2 << sep //14-15 + << a.split << sep << a.cigar << sep << a.alt << sep << a.cov << sep // ALL NEW 9/2018 //<< dc.ncount << sep << dc.tcount << sep << b1.sub_n << sep << b2.sub_n << sep << (homology.length() ? homology : "x") << sep @@ -259,25 +263,29 @@ BreakPoint::BreakPoint(const std::string &line, const SeqLib::BamHeader& h) { //case 15: dc.tcount = std::stoi(val); break; case 14: dc.mapq1 = std::stoi(val); break; case 15: dc.mapq2 = std::stoi(val); break; - case 16: b1.sub_n = std::stoi(val); break; - case 17: b2.sub_n = std::stoi(val); break; - case 18: homology = (val == "x" ? "" : val); break; - case 19: insertion = (val == "x" ? "" : val); break; - case 20: cname = val; break; - case 21: num_align = std::stoi(val); break; - case 22: confidence = val; break; - case 23: evidence = val; break; - case 24: quality = std::stoi(val); break; - case 25: secondary = val == "1"; - case 26: somatic_score = std::stod(val); break; - case 27: somatic_lod = std::stod(val); break; - case 28: a.LO = std::stod(val); break; - case 29: pon = std::stoi(val); break; - case 30: repeat_seq = val; break; - case 31: blacklist = (val=="1"); break; - case 32: rs = val; break; - case 33: read_names = val; break; - case 34: bxtable = val; break; + case 16: a.split = std::stoi(val); break; + case 17: a.cigar = std::stoi(val); break; + case 18: a.alt = std::stoi(val); break; + case 19: a.cov = std::stoi(val); break; + case 20: b1.sub_n = std::stoi(val); break; + case 21: b2.sub_n = std::stoi(val); break; + case 22: homology = (val == "x" ? "" : val); break; + case 23: insertion = (val == "x" ? "" : val); break; + case 24: cname = val; break; + case 25: num_align = std::stoi(val); break; + case 26: confidence = val; break; + case 27: evidence = val; break; + case 28: quality = std::stoi(val); break; + case 29: secondary = val == "1"; + case 30: somatic_score = std::stod(val); break; + case 31: somatic_lod = std::stod(val); break; + case 32: a.LO = std::stod(val); break; + case 33: pon = std::stoi(val); break; + case 34: repeat_seq = val; break; + case 35: blacklist = (val=="1"); break; + case 36: rs = val; break; + case 37: read_names = val; break; + case 38: bxtable = val; break; default: aaa.indel = evidence == "INDEL"; aaa.fromString(val); @@ -1238,46 +1246,48 @@ ReducedBreakPoint::ReducedBreakPoint(const std::string &line, const SeqLib::BamH alt_s = val; break; case 9: break; //span = stoi(val); break; // automatically calculated - case 10: + case 10: //mapq1 b1 = ReducedBreakEnd(GenomicRegion(chr1, pos1, pos1, h), std::stoi(val), chr_name1); b1.gr.strand = strand1; break; - case 11: + case 11: //mapq2 b2 = ReducedBreakEnd(GenomicRegion(chr2, pos2, pos2, h), std::stoi(val), chr_name2); b2.gr.strand = strand2; break; - case 12: b1.nm = std::stoi(val); break; - case 13: b2.nm = std::stoi(val); break; - case 16: b1.sub_n = std::min((int)255, std::stoi(val)); break; - case 17: b2.sub_n = std::min((int)255, std::stoi(val)); break; - //case 14: dc.ncount = std::min((int)255, std::stoi(val)); break; - //case 15: dc.tcount = std::min((int)255,std::stoi(val)); break; - case 14: dc.mapq1 = std::stoi(val); break; - case 15: dc.mapq2 = std::stoi(val); break; - case 18: + case 12: b1.nm = INTNSTOI(val,255); break; + case 13: b2.nm = INTNSTOI(val,255); break; + case 14: dc.mapq1 = INTNSTOI(val, 255); break; + case 15: dc.mapq2 = INTNSTOI(val, 255); break; + //case 16: split (not needed, since in genotype) + //case 17: cigar (not needed, since in genotype) + //case 18: alt (not needed, since in genotype) + case 19: cov = INTNSTOI(val,65535); break; + case 20: b1.sub_n = INTNSTOI(val,255); break; + case 21: b2.sub_n = INTNSTOI(val,255); break; + case 22: homology_s = val; break; - case 19: + case 23: insertion_s = val; break; - case 20: cname_s = val; break; - case 21: num_align = std::min((int)31, std::stoi(val)); break; - case 22: + case 24: cname_s = val; break; + case 25: num_align = std::min((int)31, std::stoi(val)); break; + case 26: pass = val == "PASS"; confidence_s = val; break; - case 23: + case 27: evidence_s = val; indel = val == "INDEL"; imprecise = val == "DSCRD"; break; - case 24: quality = std::stod(val); break; //std::min((int)255,std::stoi(val)); break; - case 25: secondary = val == "1" ? 1 : 0; - case 26: somatic_score = std::stod(val); break; - case 27: somatic_lod = std::stod(val); break; - case 28: true_lod = std::stod(val); break; - case 29: pon = std::min(255,std::stoi(val)); break; - case 30: repeat_s = val; break; // repeat_seq - case 31: blacklist = (val=="1" ? 1 : 0); break; - case 32: dbsnp = val != "x"; break; - case 33: read_names_s = val; break; //reads - case 34: bxtable_s = val; break; //bx tags + case 28: quality = std::stod(val); break; //std::min((int)255,std::stoi(val)); break; + case 29: secondary = val == "1" ? 1 : 0; + case 30: somatic_score = std::stod(val); break; + case 31: somatic_lod = std::stod(val); break; + case 32: true_lod = std::stod(val); break; + case 33: pon = std::min(255,std::stoi(val)); break; + case 34: repeat_s = val; break; // repeat_seq + case 35: blacklist = (val=="1" ? 1 : 0); break; + case 36: dbsnp = val != "x"; break; + case 37: read_names_s = val; break; //reads + case 38: bxtable_s = val; break; //bx tags default: format_s.push_back(val); } @@ -1672,25 +1682,30 @@ bool ReducedBreakPoint::operator<(const ReducedBreakPoint& bp) const { else if (std::strcmp(evidence, bp.evidence) > 0) // > return false; - if (nsplit > bp.nsplit) + if (cov > bp.cov) return true; - else if (nsplit < bp.nsplit) + else if (cov < bp.cov) return false; + + //if (nsplit > bp.nsplit) + // return true; + //else if (nsplit < bp.nsplit) + // return false; - if (tsplit > bp.tsplit) - return true; - else if (tsplit < bp.tsplit) - return false; + //if (tsplit > bp.tsplit) + // return true; + //else if (tsplit < bp.tsplit) + // return false; - if (dc.ncount > bp.dc.ncount) - return true; - else if (dc.ncount < bp.dc.ncount) - return false; + //if (dc.ncount > bp.dc.ncount) + // return true; + //else if (dc.ncount < bp.dc.ncount) + // return false; - if (dc.tcount > bp.dc.tcount) - return true; - else if (dc.tcount < bp.dc.tcount) - return false; + //if (dc.tcount > bp.dc.tcount) + // return true; + //else if (dc.tcount < bp.dc.tcount) + // return false; // break the tie somehow if (cname > bp.cname) diff --git a/src/svaba/BreakPoint.h b/src/svaba/BreakPoint.h index cda578f..76e07b8 100644 --- a/src/svaba/BreakPoint.h +++ b/src/svaba/BreakPoint.h @@ -137,8 +137,9 @@ struct ReducedBreakEnd { double somatic_lod = 0; // LogOdds that variant not in normal double true_lod = 0; - uint32_t nsplit:8, tsplit:8, af_n:7, num_align:5, secondary:1, dbsnp:1, pass:1, blacklist:1, indel:1, imprecise:1; - uint32_t tcov_support:8, ncov_support:8, tcov:8, ncov:8; + //uint32_t nsplit:8, tsplit:8, + //uint32_t tcov_support:8, ncov_support:8, tcov:8, ncov:8; + uint32_t cov:16, af_n:7, num_align:5, secondary:1, dbsnp:1, pass:1, blacklist:1, indel:1, imprecise:1; uint32_t tcigar:8, ncigar:8, dummy:8, af_t:8; float quality; uint8_t pon; @@ -196,7 +197,7 @@ struct ReducedBreakEnd { struct BreakPoint { static std::string header() { - return "chr1\tpos1\tstrand1\tchr2\tpos2\tstrand2\tref\talt\tspan\tmapq1\tmapq2\tnm1\tnm2\tdisc_mapq1\tdisc_mapq2\tsub_n1\tsub_n2\thomology\tinsertion\tcontig\tnumalign\tconfidence\tevidence\tquality\tsecondary_alignment\tsomatic_score\tsomatic_lod\ttrue_lod\tpon_samples\trepeat_seq\tgraylist\tDBSNP\treads\tbxtags"; + return "chr1\tpos1\tstrand1\tchr2\tpos2\tstrand2\tref\talt\tspan\tmapq1\tmapq2\tnm1\tnm2\tdisc_mapq1\tdisc_mapq2\tsplit\tcigar\talt\tcov\tsub_n1\tsub_n2\thomology\tinsertion\tcontig\tnumalign\tconfidence\tevidence\tquality\tsecondary_alignment\tsomatic_score\tsomatic_lod\ttlod\tpon_samples\trepeat_seq\tgraylist\tDBSNP\treads\tbxtags"; } double somatic_score = 0; diff --git a/src/svaba/DiscordantCluster.cpp b/src/svaba/DiscordantCluster.cpp index c7d188c..fc54506 100644 --- a/src/svaba/DiscordantCluster.cpp +++ b/src/svaba/DiscordantCluster.cpp @@ -333,7 +333,7 @@ using namespace SeqLib; mapq1 = __getMeanMapq(false); mapq2 = __getMeanMapq(true); assert(mapq1 >= 0); - assert(mapq2 >= -1); // can have -1 as placeholder if no mate reads (bc didnt do lookup)x + assert(mapq2 >= -1); // can have -1 as placeholder if no mate reads (bc didnt do lookup) // orient them correctly so that left end is first if (m_reg2 < m_reg1) { @@ -394,7 +394,7 @@ using namespace SeqLib; return -1; if (tmapq.size() > 0) - mean = std::accumulate(tmapq.begin(), tmapq.end(), 0.0) / tmapq.size(); + mean = ((double)std::accumulate(tmapq.begin(), tmapq.end(), 0.0)) / ((double)tmapq.size()); // actually, get the median //if (tmapq.size() > 0) diff --git a/src/svaba/run_svaba.cpp b/src/svaba/run_svaba.cpp index cdb0296..5918e99 100644 --- a/src/svaba/run_svaba.cpp +++ b/src/svaba/run_svaba.cpp @@ -70,7 +70,7 @@ static struct timespec start; // learned value static int max_mapq_possible; -static std::string args = "svaba "; // hold string of what the input args were +static std::string args = "svaba"; // hold string of what the input args were static int32_t readlen = 0; namespace opt { @@ -122,6 +122,7 @@ namespace opt { static size_t mate_region_lookup_limit = 400; static bool interchrom_lookup = true; static int32_t max_reads_per_assembly = -1; // set default of 50000 in parseRunOptions + static bool no_bad_avoid = true; // if true, don't avoid previously bad regions // additional optional params static int chunk = 25000; @@ -646,6 +647,7 @@ void runsvaba(int argc, char** argv) { os_discordant << DiscordantCluster::header() << std::endl; // put args into string for VCF later + args += "(v" + std::string(SVABA_VERSION) + ") "; for (int i = 0; i < argc; ++i) args += std::string(argv[i]) + " "; @@ -930,7 +932,6 @@ bool runWorkItem(const SeqLib::GenomicRegion& region, svabaThreadUnit& wu, long wu.badd.Concat(w.second.readBam(&log_file)); wu.badd.MergeOverlappingIntervals(); wu.badd.CreateTreeMap(); - // adjust the counts if (w.first.at(0) == 't') { @@ -1475,7 +1476,7 @@ CountPair run_mate_collection_loop(const SeqLib::GenomicRegion& region, WalkerMa for (auto& s : tmp_somatic_mate_regions) { // check if its not bad from mate region - if (badd.size()) + if (badd.size() && !opt::no_bad_avoid) if (badd.CountOverlaps(s)) continue; @@ -1847,7 +1848,7 @@ void collect_and_clear_reads(WalkerMap& walkers, svabaReadVector& brv, std::vect dedupe.insert(sr); } } - + // concat together all of the learning sequences if (opt::ec_correct_type != "s") assert(!w.second.all_seqs.size()); @@ -1855,7 +1856,7 @@ void collect_and_clear_reads(WalkerMap& walkers, svabaReadVector& brv, std::vect learn_seqs.push_back(strdup(r)); free(r); // free what was alloced in } - + w.second.all_seqs.clear(); w.second.reads.clear(); } diff --git a/src/svaba/run_svaba.h b/src/svaba/run_svaba.h index 6ba1e30..a27d4c9 100644 --- a/src/svaba/run_svaba.h +++ b/src/svaba/run_svaba.h @@ -24,6 +24,8 @@ #include "workqueue.h" +#define SVABA_VERSION "1.1.0" + // typedefs typedef std::map BamMap; typedef std::pair CountPair; diff --git a/src/svaba/svaba.cpp b/src/svaba/svaba.cpp index bb7b1a9..5be0acc 100644 --- a/src/svaba/svaba.cpp +++ b/src/svaba/svaba.cpp @@ -12,14 +12,14 @@ #include "refilter.h" #include "run_svaba.h" -#define AUTHOR "Jeremiah Wala " +#define AUTHOR "Jeremiah Wala [options]\n\n" "Commands:\n" diff --git a/src/svaba/svabaBamWalker.cpp b/src/svaba/svabaBamWalker.cpp index 9d56cd8..9276ad3 100644 --- a/src/svaba/svabaBamWalker.cpp +++ b/src/svaba/svabaBamWalker.cpp @@ -68,7 +68,6 @@ SeqLib::GRC svabaBamWalker::readBam(std::ofstream * log) { // keep track of regions where we hit the weird-read limit SeqLib::GRC bad_regions; - // buffer to store reads from a region // if we don't hit the limit, then add them to be assembled svabaReadVector this_reads; @@ -135,7 +134,6 @@ SeqLib::GRC svabaBamWalker::readBam(std::ofstream * log) { << " weird reads. Limit: " << SeqLib::AddCommas(m_limit) << std::endl; if (log) (*log) << ss.str(); - std::cerr << ss.str(); if (m_region.size()) bad_regions.add(m_region[tb->m_region_idx]); diff --git a/src/svaba/svabaUtils.cpp b/src/svaba/svabaUtils.cpp index 23a129f..517b678 100644 --- a/src/svaba/svabaUtils.cpp +++ b/src/svaba/svabaUtils.cpp @@ -19,13 +19,11 @@ static std::string POLYTC = "TCTCTCTCTCTCTCTCTCTCTCTC"; // set the time string time_t t = time(0); // get time now struct tm * now = localtime( & t ); - std::stringstream month; std::stringstream mdate; - if ( (now->tm_mon+1) < 10) - month << "0" << now->tm_mon+1; - else - month << now->tm_mon+1; - mdate << (now->tm_year + 1900) << month.str() << now->tm_mday; + mdate << (now->tm_year + 1900) + << (now->tm_mon + 1 < 10 ? "0": "") << (now->tm_mon + 1) + << (now->tm_mday < 10 ? "0" : "") + << now->tm_mday; return mdate.str(); } diff --git a/src/svaba/vcf.cpp b/src/svaba/vcf.cpp index 11d57f5..38660b2 100644 --- a/src/svaba/vcf.cpp +++ b/src/svaba/vcf.cpp @@ -10,6 +10,7 @@ #include "htslib/tbx.h" #include "htslib/bgzf.h" +#include "htslib/khash.h" #include "gzstream.h" #include "SeqLib/GenomicRegionCollection.h" @@ -21,11 +22,13 @@ using namespace std; static std::string sv_format = "GT:AD:DP:GQ:PL:SR:DR:LR:LO"; static std::string indel_format = "GT:AD:DP:GQ:PL:SR:CR:LR:LO"; static InfoMap flag_map; -static int global_id = 0; +//static int global_id = 0; static std::stringstream lod_ss; static std::unordered_map cname_count; +static std::unordered_set hash_avoid; // being extra careful for hash collisions + void __write_to_zip_vcf(const VCFEntry& v, BGZF * f) { std::stringstream ss; ss << v << endl; @@ -692,9 +695,26 @@ VCFEntryPair::VCFEntryPair(std::shared_ptr& b) { bp = b; e1.bp = bp; e2.bp = bp; - ++global_id; - e1.id = global_id; - e2.id = global_id; + uint32_t hashed; + int i = 0; + do { + // come up with some string that describes this, to be hashed (doesn't have to be human readable) + std::string s = std::to_string(bp->b1.gr.pos1) + "-" + + std::to_string(bp->b2.gr.pos1) + "-" + + bp->b1.chr_name + "_" + bp->b2.chr_name + std::string(b->cname) + + //"-" + std::to_string(b->cov) + "-" + std::to_string(b->af_n) + "-" + std::to_string(b->tcigar) + + //"-" + std::to_string(b->af_t) + std::to_string(b->quality) + + std::to_string(i); + hashed = __ac_Wang_hash(__ac_X31_hash_string(s.c_str())); + ++i; + //std::cout << "size " << hash_avoid.size() << " HI " << hashed << " " << (hash_avoid.find(hashed) != hash_avoid.end()) << " " << bp->b1.gr << " " << bp->b2.gr << std::endl; + } while (hash_avoid.find(hashed) != hash_avoid.end()); + + hash_avoid.insert(hashed); + + //++global_id; + e1.id = hashed; //global_id; + e2.id = hashed; //global_id; e1.id_num = 1; e2.id_num = 2; @@ -903,7 +923,7 @@ std::string VCFEntry::getIdString() const { } -std::pair VCFEntry::getSampStrings() const { +/*std::pair VCFEntry::getSampStrings() const { // put the reads into the format string //std::string new_readid_t = formatReadString(b.read_names, 't'); @@ -927,7 +947,7 @@ std::pair VCFEntry::getSampStrings() const { return std::pair(samp1, samp2); -} + }*/ void VCFHeader::addContigField(std::string id, int len) { contigfieldmap[id] = std::to_string(len); diff --git a/src/svaba/vcf.h b/src/svaba/vcf.h index 097b6d3..bcad126 100644 --- a/src/svaba/vcf.h +++ b/src/svaba/vcf.h @@ -50,9 +50,6 @@ struct VCFHeader { // output it to a string friend std::ostream& operator<<(std::ostream& out, const VCFHeader& v); - //set the filedate string to the current date - void setCurrentDate(); - //add an info field void addInfoField(std::string field, std::string number, std::string type, std::string description);