From 0df7b35de37e5c1f447267133bc99c0888cdb134 Mon Sep 17 00:00:00 2001 From: Susanna Kiwala Date: Thu, 24 Oct 2019 12:02:47 -0500 Subject: [PATCH] Fix bug in split_file function of net_chop.py and netmhc_stab.py The bug would result in the second (and subsequent) chunks of the split file having one entry removed, which would then also be missing from the output file, effectively dropping epitope predictions from the filtered files when there were more than 100 entries. This commit also moved the method to lib/utils.py so that it isn't duplicated. --- lib/net_chop.py | 15 ++------------- lib/netmhc_stab.py | 15 ++------------- lib/utils.py | 8 ++++++++ 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/lib/net_chop.py b/lib/net_chop.py index b59b2b63f..ac8a532d4 100644 --- a/lib/net_chop.py +++ b/lib/net_chop.py @@ -7,22 +7,11 @@ import os from time import sleep import collections +import lib.utils cycle = ['|', '/', '-', '\\'] methods = ['cterm', '20s'] -def split_file(reader, lines=400): - from itertools import islice, chain - for tmp in reader: - if tmp != "": - yield chain([tmp], islice(reader, lines-1)) - try: - tmp = next(reader) - except StopIteration: - return - else: - break - def main(args_input = sys.argv[1:]): parser = argparse.ArgumentParser("pvacseq net_chop", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( @@ -64,7 +53,7 @@ def main(args_input = sys.argv[1:]): i=1 print("Waiting for results from NetChop... 
|", end='') sys.stdout.flush() - for chunk in split_file(reader, 100): + for chunk in lib.utils.split_file(reader, 100): staging_file = tempfile.NamedTemporaryFile(mode='w+') current_buffer = {} for line in chunk: diff --git a/lib/netmhc_stab.py b/lib/netmhc_stab.py index b08f6ea8f..95b7c5a1d 100644 --- a/lib/netmhc_stab.py +++ b/lib/netmhc_stab.py @@ -6,22 +6,11 @@ import re import os from time import sleep +import lib.utils cycle = ['|', '/', '-', '\\'] methods = ['cterm', '20s'] -def split_file(reader, lines=400): - from itertools import islice, chain - for tmp in reader: - if tmp != "": - yield chain([tmp], islice(reader, lines-1)) - try: - tmp = next(reader) - except StopIteration: - return - else: - break - def main(args_input = sys.argv[1:]): parser = argparse.ArgumentParser("pvacseq net_chop", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( @@ -51,7 +40,7 @@ def main(args_input = sys.argv[1:]): i=1 print("Waiting for results from NetMHCStabPan... |", end='') sys.stdout.flush() - for chunk in split_file(reader, 100): + for chunk in lib.utils.split_file(reader, 100): peptide_lengths = set() staging_file = tempfile.NamedTemporaryFile(mode='w+') current_buffer = {} diff --git a/lib/utils.py b/lib/utils.py index b49cafe72..a6e00cea3 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,5 +1,13 @@ import binascii +from itertools import islice def is_gz_file(filepath): with open(filepath, 'rb') as test_f: return binascii.hexlify(test_f.read(2)) == b'1f8b' + +def split_file(reader, lines): + i = iter(reader) + piece = list(islice(i, lines)) + while piece: + yield piece + piece = list(islice(i, lines))