diff --git a/nbproject/build-impl.xml b/nbproject/build-impl.xml index f09872d..d88f591 100644 --- a/nbproject/build-impl.xml +++ b/nbproject/build-impl.xml @@ -71,8 +71,8 @@ is divided into following sections: - - + + @@ -101,7 +101,7 @@ is divided into following sections: - + @@ -181,6 +181,7 @@ is divided into following sections: + @@ -216,7 +217,12 @@ is divided into following sections: - + + + + + + @@ -242,6 +248,7 @@ is divided into following sections: + @@ -718,7 +725,7 @@ is divided into following sections: - + @@ -793,7 +800,7 @@ is divided into following sections: - + @@ -820,7 +827,7 @@ is divided into following sections: - + @@ -859,7 +866,7 @@ is divided into following sections: - + @@ -871,7 +878,7 @@ is divided into following sections: - + @@ -994,15 +1001,15 @@ is divided into following sections: - + - + - + @@ -1010,7 +1017,7 @@ is divided into following sections: - + @@ -1205,7 +1212,7 @@ is divided into following sections: Must select one file in the IDE or set run.class - + Must select one file in the IDE or set applet.url diff --git a/nbproject/genfiles.properties b/nbproject/genfiles.properties index e5b83ba..1293f1b 100644 --- a/nbproject/genfiles.properties +++ b/nbproject/genfiles.properties @@ -4,5 +4,5 @@ build.xml.stylesheet.CRC32=8064a381@1.75.2.48 # This file is used by a NetBeans-based IDE to track changes in generated files such as build-impl.xml. # Do not edit this file. You may delete it but then the IDE will never regenerate such files for you. nbproject/build-impl.xml.data.CRC32=91437f43 -nbproject/build-impl.xml.script.CRC32=097bff4d -nbproject/build-impl.xml.stylesheet.CRC32=876e7a8f@1.75.2.48 +nbproject/build-impl.xml.script.CRC32=9ddad943 +nbproject/build-impl.xml.stylesheet.CRC32=830a3534@1.80.1.48 diff --git a/src/libra/common/hadoop/io/reader/sequence/RawReadReader.java b/src/libra/common/hadoop/io/reader/sequence/RawReadReader.java index e362a7b..f3d5696 100644 --- a/src/libra/common/hadoop/io/reader/sequence/RawReadReader.java +++ b/src/libra/common/hadoop/io/reader/sequence/RawReadReader.java @@ -35,110 +35,286 @@ public class RawReadReader implements Closeable { private static final Log LOG = LogFactory.getLog(RawReadReader.class); + private static final int LINE_BUFFERS = 4; + private SampleFormat format; private LineReader in; - private Text buffer = new Text(); - private int bufferConsumed; - private char delimiter; + private Text[] buffers = new Text[LINE_BUFFERS]; + private int[] bufferConsumed = new int[LINE_BUFFERS]; + private boolean eof = false; private boolean finished = false; public RawReadReader(SampleFormat format, InputStream in) { this.format = format; this.in = new LineReader(in); - this.bufferConsumed = 0; + for(int i=0;i= LINE_BUFFERS) { + for(int i=0;i 0) { + // fill buffer + int filled = 0; + for(int i=0;i 0) { + return true; + } else { + return false; + } } else { return true; } } - - public long skipIncompleteRead() throws IOException { + + private long _skipIncompleteFASTARead() throws IOException { if(this.finished) { return 0; } + boolean hasBufferData = _fillBuffer(); + if(!hasBufferData) { + //EOF + this.finished = true; + return 0; + } + long bytesConsumed = 0; + boolean headerFound = false; + while(hasBufferData) { + if(this.buffers[0].getLength() > 0 && this.buffers[0].charAt(0) == Read.FASTA_READ_DESCRIPTION_IDENTIFIER) { + headerFound = true; + break; + } else { + bytesConsumed += this.bufferConsumed[0]; + // refill buffer + _shiftBuffer(1); + hasBufferData = _fillBuffer(); + } + } - boolean hasBufferData = _fillBuffer(false); + if(!headerFound) { + //EOF + this.finished = true; + } + + return bytesConsumed; + } + + private long _skipIncompleteFASTQRead() throws IOException { + if(this.finished) { + return 0; + } + + boolean hasBufferData = _fillBuffer(); if(!hasBufferData) { //EOF this.finished = true; - return bytesConsumed; + return 0; } + long bytesConsumed = 0; boolean headerFound = false; while(hasBufferData) { - if(this.buffer.getLength() > 0 && this.buffer.charAt(0) == this.delimiter) { + int emptyBufferCount = _countEmptyBuffer(); + if(this.buffers[0].getLength() > 0 && this.buffers[0].charAt(0) == Read.FASTQ_READ_DESCRIPTION_IDENTIFIER && + this.buffers[2].getLength() > 0 && this.buffers[2].charAt(0) == Read.FASTQ_READ_DESCRIPTION2_IDENTIFIER && + emptyBufferCount == 0) { headerFound = true; break; } else { - bytesConsumed += this.bufferConsumed; + bytesConsumed += this.bufferConsumed[0]; // refill buffer - hasBufferData = _fillBuffer(true); + _shiftBuffer(1); + hasBufferData = _fillBuffer(); } } if(!headerFound) { //EOF - bytesConsumed += this.bufferConsumed; this.finished = true; - this.bufferConsumed = 0; } return bytesConsumed; } - public long readRead(Read read) throws IOException { + public long skipIncompleteRead() throws IOException { + switch(this.format) { + case FASTA: + return _skipIncompleteFASTARead(); + case FASTQ: + return _skipIncompleteFASTQRead(); + default: + throw new IOException("Unknown format"); + } + } + + private void _printBuffer() { + for(int i=0;i 0 && this.buffers[0].charAt(0) == Read.FASTA_READ_DESCRIPTION_IDENTIFIER) { + // GO! + // add header + List lines = new ArrayList(); + + String lineStr = this.buffers[0].toString(); + if(lineStr.trim().length() > 0) { + lines.add(lineStr); + } + + bytesConsumed += this.bufferConsumed[0]; + _shiftBuffer(1); + hasBufferData = _fillBuffer(); + + boolean nextHeaderFound = false; + while(hasBufferData) { + if(this.buffers[0].getLength() > 0 && this.buffers[0].charAt(0) == Read.FASTA_READ_DESCRIPTION_IDENTIFIER) { + nextHeaderFound = true; + break; + } else { + lineStr = this.buffers[0].toString(); + if(lineStr.trim().length() > 0) { + lines.add(lineStr); + } + + bytesConsumed += this.bufferConsumed[0]; + // refill buffer + _shiftBuffer(1); + hasBufferData = _fillBuffer(); + } + } + + if(!nextHeaderFound) { + //EOF + this.finished = true; + } + + read.parse(lines); + } else { + throw new IOException(String.format("Unknown data for FASTA read - %s", this.buffers[0].toString())); + } + + return bytesConsumed; + } + + private long _readFASTQRead(Read read) throws IOException { read.clear(); long bytesConsumed = 0; @@ -149,40 +325,50 @@ public long readRead(Read read) throws IOException { } // check buffer has a header - boolean hasBufferData = _fillBuffer(false); + boolean hasBufferData = _fillBuffer(); if(!hasBufferData) { //EOF this.finished = true; return bytesConsumed; } - if(this.buffer.getLength() > 0 && this.buffer.charAt(0) == this.delimiter) { + int emptyBufferCount = _countEmptyBuffer(); + if(this.buffers[0].getLength() > 0 && this.buffers[0].charAt(0) == Read.FASTQ_READ_DESCRIPTION_IDENTIFIER && + this.buffers[2].getLength() > 0 && this.buffers[2].charAt(0) == Read.FASTQ_READ_DESCRIPTION2_IDENTIFIER && + emptyBufferCount == 0) { // GO! // add header List lines = new ArrayList(); - String lineStr = this.buffer.toString(); + String lineStr = this.buffers[0].toString(); if(lineStr.trim().length() > 0) { lines.add(lineStr); } - bytesConsumed += this.bufferConsumed; - hasBufferData = _fillBuffer(true); + bytesConsumed += this.bufferConsumed[0]; + _shiftBuffer(1); + hasBufferData = _fillBuffer(); boolean nextHeaderFound = false; while(hasBufferData) { - if(this.buffer.getLength() > 0 && this.buffer.charAt(0) == this.delimiter) { + emptyBufferCount = _countEmptyBuffer(); + //_printBuffer(); + + if(this.buffers[0].getLength() > 0 && this.buffers[0].charAt(0) == Read.FASTQ_READ_DESCRIPTION_IDENTIFIER && + this.buffers[2].getLength() > 0 && this.buffers[2].charAt(0) == Read.FASTQ_READ_DESCRIPTION2_IDENTIFIER && + emptyBufferCount == 0) { nextHeaderFound = true; break; } else { - lineStr = this.buffer.toString(); + lineStr = this.buffers[0].toString(); if(lineStr.trim().length() > 0) { lines.add(lineStr); } - bytesConsumed += this.bufferConsumed; + bytesConsumed += this.bufferConsumed[0]; // refill buffer - hasBufferData = _fillBuffer(true); + _shiftBuffer(1); + hasBufferData = _fillBuffer(); } } @@ -193,9 +379,20 @@ public long readRead(Read read) throws IOException { read.parse(lines); } else { - throw new IOException(String.format("Unknown data - %s", this.buffer.toString())); + throw new IOException(String.format("Unknown data for FASTQ read - %s", this.buffers[0].toString())); } return bytesConsumed; } + + public long readRead(Read read) throws IOException { + switch(this.format) { + case FASTA: + return _readFASTARead(read); + case FASTQ: + return _readFASTQRead(read); + default: + throw new IOException("Unknown format"); + } + } }