From 5d3214a3a20a24030c53b7df40ce1ba996771951 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 20 Oct 2023 08:47:23 -0700 Subject: [PATCH] Accommodate '@' as a valid character in the quality strings. This requires us to use a length-based approach to finding the end of a quality string, given that '@' is not a reliable terminator. --- CMakeLists.txt | 2 +- include/kaori/FastqReader.hpp | 34 ++++++++-------- tests/src/FastqReader.cpp | 73 +++++++++++++++++++++++++++++------ 3 files changed, 80 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e292b8..378ae73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.24) project(kaori - VERSION 1.1.0 + VERSION 1.1.1 DESCRIPTION "Header-only C++ library for screen counting" LANGUAGES CXX) diff --git a/include/kaori/FastqReader.hpp b/include/kaori/FastqReader.hpp index 7bad472..a97089a 100644 --- a/include/kaori/FastqReader.hpp +++ b/include/kaori/FastqReader.hpp @@ -29,13 +29,7 @@ class FastqReader { FastqReader(byteme::Reader* p) : pb(p) { sequence.reserve(200); name.reserve(200); - okay = pb.valid(); - if (okay) { - if (pb.get() != '@') { - throw std::runtime_error("first line containing FASTQ name should start with '@'"); - } - } } /** @@ -55,7 +49,12 @@ class FastqReader { // Processing the name. This should be on a single line, hopefully. name.clear(); - char val = advance_and_check(); + char val = pb.get(); + if (val != '@') { + throw std::runtime_error("read name should start with '@' (starting line " + std::to_string(init_line + 1) + ")"); + } + + val = advance_and_check(); while (!std::isspace(val)) { name.push_back(val); val = advance_and_check(); @@ -84,26 +83,29 @@ class FastqReader { } ++line_count; - // Processing the qualities. Extraction is allowed to fail if we're at the - // end of the file. - size_t qual_length = 0; + // Processing the qualities. Extraction is allowed to fail if we're at + // the end of the file. 
Note that we can't check for '@' as a + // delimiter, as this can be a valid score, so instead we check at each + // newline whether we've reached the specified length, and quit if so. + size_t qual_length = 0, seq_length = sequence.size(); okay = false; + while (pb.advance()) { val = pb.get(); - if (val == '@') { - okay = true; - break; - } if (val != '\n') { ++qual_length; + } else if (qual_length >= seq_length) { + okay = pb.advance(); // sneak past the newline. + break; } } - ++line_count; - if (qual_length != sequence.size()) { + if (qual_length != seq_length) { throw std::runtime_error("non-equal lengths for quality and sequence strings (starting line " + std::to_string(init_line + 1) + ")"); } + ++line_count; + return true; } diff --git a/tests/src/FastqReader.cpp b/tests/src/FastqReader.cpp index 6537b38..2768734 100644 --- a/tests/src/FastqReader.cpp +++ b/tests/src/FastqReader.cpp @@ -51,29 +51,42 @@ TEST(BasicTests, MultipleEntries) { } TEST(BasicTests, MultiLineEntries) { - std::string buffer = "@FOO\nA\nCG\nTGCA\n+\n!!\n!!\n!!!"; - byteme::RawBufferReader reader(reinterpret_cast(buffer.c_str()), buffer.size()); - kaori::FastqReader fq(&reader); + for (size_t i = 0; i < 2; ++i) { + std::string buffer = "@FOO\nA\nCG\nTGCA\n+\n!!\n!!\n!!!\n@ARG\nACACGGT\nC\n+\n@@@\n@\n@@@@"; + if (i == 2) { + buffer += '\n'; + } - EXPECT_TRUE(fq()); - const auto& name = fq.get_name(); - EXPECT_EQ(std::string(name.begin(), name.end()), "FOO"); - const auto& seq = fq.get_sequence(); - EXPECT_EQ(std::string(seq.begin(), seq.end()), "ACGTGCA"); + byteme::RawBufferReader reader(reinterpret_cast(buffer.c_str()), buffer.size()); + kaori::FastqReader fq(&reader); - EXPECT_FALSE(fq()); + EXPECT_TRUE(fq()); + const auto& name = fq.get_name(); + EXPECT_EQ(std::string(name.begin(), name.end()), "FOO"); + const auto& seq = fq.get_sequence(); + EXPECT_EQ(std::string(seq.begin(), seq.end()), "ACGTGCA"); + + EXPECT_TRUE(fq()); + const auto& name2 = fq.get_name(); 
EXPECT_EQ(std::string(name2.begin(), name2.end()), "ARG"); + const auto& seq2 = fq.get_sequence(); + EXPECT_EQ(std::string(seq2.begin(), seq2.end()), "ACACGGTC"); + + EXPECT_FALSE(fq()); + } } TEST(BasicTests, Errors) { { std::string buffer = "FOO"; byteme::RawBufferReader reader(reinterpret_cast(buffer.c_str()), buffer.size()); + kaori::FastqReader fq(&reader); EXPECT_ANY_THROW({ try { - kaori::FastqReader fq(&reader); + fq(); } catch (std::exception& e) { - EXPECT_TRUE(std::string(e.what()).find("first line") != std::string::npos); + EXPECT_TRUE(std::string(e.what()).find("read name should start") != std::string::npos); throw e; } }); @@ -95,7 +108,25 @@ TEST(BasicTests, Errors) { } { - std::string buffer = "@FOO\nAC\n+\n!!\n@WHEE\nACGT\n+\n!!"; + std::string buffer = "@FOO\nAC\n+\n!!\n@WHEE\nACGT\n+\n!!"; // too short. + byteme::RawBufferReader reader(reinterpret_cast(buffer.c_str()), buffer.size()); + kaori::FastqReader fq(&reader); + fq(); + + EXPECT_ANY_THROW({ + try { + fq(); + } catch (std::exception& e) { + std::string msg(e.what()); + EXPECT_TRUE(msg.find("non-equal lengths") != std::string::npos); + EXPECT_TRUE(msg.find("line 5") != std::string::npos); + throw e; + } + }); + } + + { + std::string buffer = "@FOO\nAC\n+\n!!\n@WHEE\nACGT\n+\n!!!@@!!@\n"; // too long. byteme::RawBufferReader reader(reinterpret_cast(buffer.c_str()), buffer.size()); kaori::FastqReader fq(&reader); fq(); @@ -111,6 +142,24 @@ TEST(BasicTests, Errors) { } }); } + + { + std::string buffer = "@FOO\nAC\n+\n!!\nWHEE"; + byteme::RawBufferReader reader(reinterpret_cast(buffer.c_str()), buffer.size()); + kaori::FastqReader fq(&reader); + fq(); + + EXPECT_ANY_THROW({ + try { + fq(); + } catch (std::exception& e) { + std::string msg(e.what()); + EXPECT_TRUE(msg.find("should start") != std::string::npos); + EXPECT_TRUE(msg.find("line 5") != std::string::npos); + throw e; + } + }); + } } class FastqReaderFileTest : public testing::TestWithParam {};