From 0df7b35de37e5c1f447267133bc99c0888cdb134 Mon Sep 17 00:00:00 2001 From: Susanna Kiwala Date: Thu, 24 Oct 2019 12:02:47 -0500 Subject: [PATCH] Fix bug in split_file function of net_chop.py and netmhc_stab.py The bug would result in the second (and subsequent) chunks of the split file having one entry removed, which would then also be missing from the output file, effectively dropping epitope predictions from the filtered files when there were more than 100 entries. This commit also moved the method to lib/utils.py so that it isn't duplicated. --- lib/net_chop.py | 15 ++------------- lib/netmhc_stab.py | 15 ++------------- lib/utils.py | 8 ++++++++ 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/lib/net_chop.py b/lib/net_chop.py index b59b2b63f..ac8a532d4 100644 --- a/lib/net_chop.py +++ b/lib/net_chop.py @@ -7,22 +7,11 @@ import os from time import sleep import collections +import lib.utils cycle = ['|', '/', '-', '\\'] methods = ['cterm', '20s'] -def split_file(reader, lines=400): - from itertools import islice, chain - for tmp in reader: - if tmp != "": - yield chain([tmp], islice(reader, lines-1)) - try: - tmp = next(reader) - except StopIteration: - return - else: - break - def main(args_input = sys.argv[1:]): parser = argparse.ArgumentParser("pvacseq net_chop", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( @@ -64,7 +53,7 @@ def main(args_input = sys.argv[1:]): i=1 print("Waiting for results from NetChop... 
|", end='') sys.stdout.flush() - for chunk in split_file(reader, 100): + for chunk in lib.utils.split_file(reader, 100): staging_file = tempfile.NamedTemporaryFile(mode='w+') current_buffer = {} for line in chunk: diff --git a/lib/netmhc_stab.py b/lib/netmhc_stab.py index b08f6ea8f..95b7c5a1d 100644 --- a/lib/netmhc_stab.py +++ b/lib/netmhc_stab.py @@ -6,22 +6,11 @@ import re import os from time import sleep +import lib.utils cycle = ['|', '/', '-', '\\'] methods = ['cterm', '20s'] -def split_file(reader, lines=400): - from itertools import islice, chain - for tmp in reader: - if tmp != "": - yield chain([tmp], islice(reader, lines-1)) - try: - tmp = next(reader) - except StopIteration: - return - else: - break - def main(args_input = sys.argv[1:]): parser = argparse.ArgumentParser("pvacseq net_chop", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( @@ -51,7 +40,7 @@ def main(args_input = sys.argv[1:]): i=1 print("Waiting for results from NetMHCStabPan... |", end='') sys.stdout.flush() - for chunk in split_file(reader, 100): + for chunk in lib.utils.split_file(reader, 100): peptide_lengths = set() staging_file = tempfile.NamedTemporaryFile(mode='w+') current_buffer = {} diff --git a/lib/utils.py b/lib/utils.py index b49cafe72..a6e00cea3 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,5 +1,13 @@ import binascii +from itertools import islice def is_gz_file(filepath): with open(filepath, 'rb') as test_f: return binascii.hexlify(test_f.read(2)) == b'1f8b' + +def split_file(reader, lines): + i = iter(reader) + piece = list(islice(i, lines)) + while piece: + yield piece + piece = list(islice(i, lines))