Commit

Merge remote-tracking branch 'origin/hotfix'
susannasiebert committed Oct 25, 2019
2 parents 6b8a8ca + b91e136 commit dc0fda2
Showing 28 changed files with 1,029 additions and 506 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -69,7 +69,7 @@
# The short X.Y version.
version = '1.5'
# The full version, including alpha/beta/rc tags.
release = '1.5.2'
release = '1.5.3'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
18 changes: 15 additions & 3 deletions docs/index.rst
@@ -52,9 +52,21 @@ New in release |release|

This is a hotfix release. It fixes the following issues:

- AGFusion exon files may be comma-delimited. Previously, the file parser
assumed the files were tab-delimited. This release now allows AGFusion
inputs that are comma- or tab-delimited.
- pVACbind would previously throw an error if a peptide sequence in the input
fasta was shorter than one of the chosen epitope lengths. This issue has been
fixed by first parsing the input fasta and creating an individual fasta file
for each epitope length, containing only the peptide sequences that are at
least as long as that epitope length.
- Previous versions of pVACtools resolved an issue where IEDB would output a
warning line if one of the epitope sequences contained only A, C, G, or T
amino acids, since those sequences could also be nucleotide sequences.
However, this was only fixed in pVACseq, not in pVACbind or pVACvector. This
release fixes the issue for all tools.
- The wrappers for NetChop and NetMHCstabpan split the set of input epitopes
into chunks of 100 before processing. Due to a bug in the file splitting
logic, one epitope for each chunk over 100 would be erroneously dropped. This
would effectively result in fewer epitopes being returned in the filtered
report than when running the pipelines without NetChop or NetMHCstabpan.

New in version |version|
------------------------
15 changes: 2 additions & 13 deletions lib/net_chop.py
@@ -7,22 +7,11 @@
import os
from time import sleep
import collections
import lib.utils

cycle = ['|', '/', '-', '\\']
methods = ['cterm', '20s']

def split_file(reader, lines=400):
    from itertools import islice, chain
    for tmp in reader:
        if tmp != "":
            yield chain([tmp], islice(reader, lines-1))
            try:
                tmp = next(reader)
            except StopIteration:
                return
        else:
            break

def main(args_input = sys.argv[1:]):
parser = argparse.ArgumentParser("pvacseq net_chop", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
@@ -64,7 +53,7 @@ def main(args_input = sys.argv[1:]):
i=1
print("Waiting for results from NetChop... |", end='')
sys.stdout.flush()
for chunk in split_file(reader, 100):
for chunk in lib.utils.split_file(reader, 100):
staging_file = tempfile.NamedTemporaryFile(mode='w+')
current_buffer = {}
for line in chunk:
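For reference, the generator deleted above is the source of the dropped-epitope bug described in the release notes: after yielding a full chunk it consumes one extra line with next(reader), and the enclosing for loop then advances the iterator again, so that line never reaches any chunk. A minimal sketch of the behavior, with illustrative peptide names and a chunk size of 2 instead of 100:

    from itertools import islice, chain

    def old_split_file(reader, lines=400):
        # Copy of the generator removed from lib/net_chop.py above.
        for tmp in reader:
            if tmp != "":
                yield chain([tmp], islice(reader, lines - 1))
                try:
                    # Reads one line; the for loop then rebinds tmp from the
                    # iterator, so this line is silently discarded.
                    tmp = next(reader)
                except StopIteration:
                    return
            else:
                break

    epitopes = ['pep1', 'pep2', 'pep3', 'pep4', 'pep5']
    print([list(chunk) for chunk in old_split_file(iter(epitopes), lines=2)])
    # [['pep1', 'pep2'], ['pep4', 'pep5']] -- 'pep3' is lost at the chunk boundary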
15 changes: 2 additions & 13 deletions lib/netmhc_stab.py
@@ -6,22 +6,11 @@
import re
import os
from time import sleep
import lib.utils

cycle = ['|', '/', '-', '\\']
methods = ['cterm', '20s']

def split_file(reader, lines=400):
    from itertools import islice, chain
    for tmp in reader:
        if tmp != "":
            yield chain([tmp], islice(reader, lines-1))
            try:
                tmp = next(reader)
            except StopIteration:
                return
        else:
            break

def main(args_input = sys.argv[1:]):
parser = argparse.ArgumentParser("pvacseq net_chop", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
@@ -51,7 +40,7 @@ def main(args_input = sys.argv[1:]):
i=1
print("Waiting for results from NetMHCStabPan... |", end='')
sys.stdout.flush()
for chunk in split_file(reader, 100):
for chunk in lib.utils.split_file(reader, 100):
peptide_lengths = set()
staging_file = tempfile.NamedTemporaryFile(mode='w+')
current_buffer = {}
161 changes: 152 additions & 9 deletions lib/pipeline.py
@@ -480,8 +480,11 @@ def fasta_entry_count(self):
row_count += 1
return row_count

def split_fasta_basename(self):
return os.path.join(self.tmp_dir, self.sample_name + ".fa.split")
def fasta_basename(self, length):
return os.path.join(self.tmp_dir, "{}.{}.fa".format(self.sample_name, length))

def split_fasta_basename(self, length):
return "{}.split".format(self.fasta_basename(length))

def uniquify_records(self, records):
fasta_sequences = OrderedDict()
@@ -497,7 +500,14 @@ def uniquify_records(self, records):
count += 1
return (uniq_records, keys)

def split_fasta_file(self):
def create_per_length_fasta(self, length):
records = []
for record in SeqIO.parse(self.input_file, "fasta"):
if len(str(record.seq)) >= length:
records.append(record)
SeqIO.write(records, self.fasta_basename(length), "fasta")

def split_fasta_file(self, length):
fasta_entry_count = self.fasta_entry_count()
status_message("Splitting FASTA into smaller chunks")
chunks = []
@@ -508,7 +518,7 @@
if split_end > fasta_entry_count:
split_end = fasta_entry_count
status_message("Splitting FASTA into smaller chunks - Entries %d-%d" % (split_start, split_end))
split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(), split_start, split_end)
split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(length), split_start, split_end)
split_fasta_key_file_path = "{}.key".format(split_fasta_file_path)
chunks.append([split_start, split_end])
if os.path.exists(split_fasta_file_path):
@@ -517,7 +527,7 @@
else:
split_fasta_records = []
skip = 0
for record in SeqIO.parse(self.input_file, "fasta"):
for record in SeqIO.parse(self.fasta_basename(length), "fasta"):
if skip == 0:
split_fasta_records.append(record)
if row_count == fasta_entry_count:
@@ -535,7 +545,7 @@
if split_end > fasta_entry_count:
split_end = fasta_entry_count
status_message("Splitting FASTA into smaller chunks - Entries %d-%d" % (split_start, split_end))
split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(), split_start, split_end)
split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(length), split_start, split_end)
split_fasta_key_file_path = "{}.key".format(split_fasta_file_path)
chunks.append([split_start, split_end])
if os.path.exists(split_fasta_file_path):
@@ -554,12 +564,145 @@
status_message("Completed")
return chunks

def call_iedb(self, chunks, length):
alleles = self.alleles
prediction_algorithms = self.prediction_algorithms
argument_sets = []
warning_messages = []
for (split_start, split_end) in chunks:
tsv_chunk = "%d-%d" % (split_start, split_end)
if self.input_file_type == 'fasta':
fasta_chunk = tsv_chunk
else:
fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
for a in alleles:
split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(length), fasta_chunk)
if os.path.getsize(split_fasta_file_path) == 0:
msg = "Fasta file {} is empty. Skipping".format(split_fasta_file_path)
if msg not in warning_messages:
warning_messages.append(msg)
continue
#begin of per-algorithm processing
for method in prediction_algorithms:
prediction_class = globals()[method]
prediction = prediction_class()
if hasattr(prediction, 'iedb_prediction_method'):
iedb_method = prediction.iedb_prediction_method
else:
iedb_method = method
valid_alleles = prediction.valid_allele_names()
if a not in valid_alleles:
msg = "Allele %s not valid for Method %s. Skipping." % (a, method)
if msg not in warning_messages:
warning_messages.append(msg)
continue
valid_lengths = prediction.valid_lengths_for_allele(a)
if length not in valid_lengths:
msg = "Epitope Length %s is not valid for Method %s and Allele %s. Skipping." % (length, method, a)
if msg not in warning_messages:
warning_messages.append(msg)
continue

split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(length), "tsv_%s" % fasta_chunk]))
if os.path.exists(split_iedb_out):
msg = "Prediction file for Allele %s and Epitope Length %s with Method %s (Entries %s) already exists. Skipping." % (a, length, method, fasta_chunk)
if msg not in warning_messages:
warning_messages.append(msg)
continue
arguments = [
split_fasta_file_path,
split_iedb_out,
method,
a,
'-r', str(self.iedb_retries),
'-e', self.iedb_executable,
]
if not isinstance(prediction, IEDBMHCII):
arguments.extend(['-l', str(length),])
argument_sets.append(arguments)

for msg in warning_messages:
status_message(msg)

with pymp.Parallel(self.n_threads) as p:
for index in p.range(len(argument_sets)):
arguments = argument_sets[index]
a = arguments[3]
method = arguments[2]
filename = arguments[1]
if len(arguments) == 10:
epl = arguments[9]
else:
epl = 15
p.print("Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s" % (a, epl, method, filename))
lib.call_iedb.main(arguments)
p.print("Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s - Completed" % (a, epl, method, filename))

def parse_outputs(self, chunks, length):
split_parsed_output_files = []
for (split_start, split_end) in chunks:
tsv_chunk = "%d-%d" % (split_start, split_end)
if self.input_file_type == 'fasta':
fasta_chunk = tsv_chunk
else:
fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
for a in self.alleles:
split_iedb_output_files = []
status_message("Parsing binding predictions for Allele %s and Epitope Length %s - Entries %s" % (a, length, fasta_chunk))
for method in self.prediction_algorithms:
prediction_class = globals()[method]
prediction = prediction_class()
if hasattr(prediction, 'iedb_prediction_method'):
iedb_method = prediction.iedb_prediction_method
else:
iedb_method = method
valid_alleles = prediction.valid_allele_names()
if a not in valid_alleles:
continue
valid_lengths = prediction.valid_lengths_for_allele(a)
if length not in valid_lengths:
continue
split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(length), "tsv_%s" % fasta_chunk]))
if os.path.exists(split_iedb_out):
split_iedb_output_files.append(split_iedb_out)

split_parsed_file_path = os.path.join(self.tmp_dir, ".".join([self.sample_name, a, str(length), "parsed", "tsv_%s" % fasta_chunk]))
if os.path.exists(split_parsed_file_path):
status_message("Parsed Output File for Allele %s and Epitope Length %s (Entries %s) already exists. Skipping" % (a, length, fasta_chunk))
split_parsed_output_files.append(split_parsed_file_path)
continue
split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(length), fasta_chunk)
split_fasta_key_file_path = split_fasta_file_path + '.key'

if len(split_iedb_output_files) > 0:
status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s" % (a, length, fasta_chunk))
split_tsv_file_path = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
params = {
'input_iedb_files' : split_iedb_output_files,
'input_tsv_file' : split_tsv_file_path,
'key_file' : split_fasta_key_file_path,
'output_file' : split_parsed_file_path,
}
if self.additional_report_columns and 'sample_name' in self.additional_report_columns:
params['sample_name'] = self.sample_name
else:
params['sample_name'] = None
parser = self.output_parser(params)
parser.execute()
status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s - Completed" % (a, length, fasta_chunk))

split_parsed_output_files.append(split_parsed_file_path)
return split_parsed_output_files

def execute(self):
self.print_log()

chunks = self.split_fasta_file()
self.call_iedb(chunks)
split_parsed_output_files = self.parse_outputs(chunks)
split_parsed_output_files = []
for length in self.epitope_lengths:
self.create_per_length_fasta(length)
chunks = self.split_fasta_file(length)
self.call_iedb(chunks, length)
split_parsed_output_files.extend(self.parse_outputs(chunks, length))

if len(split_parsed_output_files) == 0:
status_message("No output files were created. Aborting.")
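The pVACbind fix for short peptide sequences hinges on the new create_per_length_fasta method above: for each epitope length, the input fasta is first reduced to the records long enough to yield epitopes of that length, and only that per-length file is split and sent for prediction. A minimal standalone sketch of the same filtering step using Biopython, as the pipeline does; the helper name and file paths here are illustrative, not part of the pVACtools API:

    from Bio import SeqIO

    def write_per_length_fasta(input_fasta, output_fasta, epitope_length):
        # Keep only records at least as long as the requested epitope length.
        records = [
            record for record in SeqIO.parse(input_fasta, "fasta")
            if len(record.seq) >= epitope_length
        ]
        SeqIO.write(records, output_fasta, "fasta")

    # One filtered fasta per chosen epitope length, e.g. typical class I lengths:
    for length in (8, 9, 10, 11):
        write_per_length_fasta("sample.fa", "sample.{}.fa".format(length), length)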
8 changes: 8 additions & 0 deletions lib/utils.py
@@ -1,5 +1,13 @@
import binascii
from itertools import islice

def is_gz_file(filepath):
    with open(filepath, 'rb') as test_f:
        return binascii.hexlify(test_f.read(2)) == b'1f8b'

def split_file(reader, lines):
    i = iter(reader)
    piece = list(islice(i, lines))
    while piece:
        yield piece
        piece = list(islice(i, lines))
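The replacement generator materializes successive fixed-size slices of the iterator, so every input line lands in exactly one chunk and nothing is skipped between chunks. A quick usage sketch with illustrative values (the NetChop and NetMHCstabpan wrappers call it with chunks of 100):

    from itertools import islice

    def split_file(reader, lines):
        # Same logic as lib.utils.split_file above.
        i = iter(reader)
        piece = list(islice(i, lines))
        while piece:
            yield piece
            piece = list(islice(i, lines))

    epitopes = ['pep1', 'pep2', 'pep3', 'pep4', 'pep5']
    print(list(split_file(epitopes, 2)))
    # [['pep1', 'pep2'], ['pep3', 'pep4'], ['pep5']] -- no epitopes are dropped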
2 changes: 1 addition & 1 deletion setup.py
@@ -61,7 +61,7 @@

setup(
name="pvactools",
version="1.5.2",
version="1.5.3",
packages=[
"tools",
"tools.pvacbind",