Commit

Merge remote-tracking branch 'origin/hotfix'
susannasiebert committed Oct 25, 2019
2 parents 6b8a8ca + b91e136 commit dc0fda2
Showing 28 changed files with 1,029 additions and 506 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -69,7 +69,7 @@
# The short X.Y version.
version = '1.5'
# The full version, including alpha/beta/rc tags.
release = '1.5.2'
release = '1.5.3'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
18 changes: 15 additions & 3 deletions docs/index.rst
@@ -52,9 +52,21 @@ New in release |release|

This is a hotfix release. It fixes the following issues:

- AGFusion exon files may be comma-delimited. Previously, the file parser
assumed the files were tab-delimited. This release now allows AGFusion
inputs that are comma- or tab-delimited.
- pVACbind would previously throw an error if a peptide sequence in the input
fasta was shorter than one of the chosen epitope lengths. This issue has been
fixed by first parsing the input fasta and creating an individual fasta file
for each epitope length, containing only the peptide sequences that are at
least as long as that epitope length.
- Previous versions of pVACtools resolved an issue where IEDB would output a
warning line if one of the epitope sequences contained only A, C, G, or T
amino acids, since those sequences could also be nucleotide sequences.
However, this was only fixed in pVACseq, not in pVACbind or pVACvector. This
release fixes the issue for all tools.
- The wrappers for NetChop and NetMHCstabpan split the set of input epitopes
into chunks of 100 before processing. Due to a bug in the file splitting
logic, one epitope for each chunk over 100 would be erroneously dropped. This
would effectively result in fewer epitopes being returned in the filtered
report than when running the pipelines without NetChop or NetMHCstabpan.

New in version |version|
------------------------
15 changes: 2 additions & 13 deletions lib/net_chop.py
@@ -7,22 +7,11 @@
import os
from time import sleep
import collections
import lib.utils

cycle = ['|', '/', '-', '\\']
methods = ['cterm', '20s']

def split_file(reader, lines=400):
    from itertools import islice, chain
    for tmp in reader:
        if tmp != "":
            yield chain([tmp], islice(reader, lines-1))
            try:
                tmp = next(reader)
            except StopIteration:
                return
        else:
            break

def main(args_input = sys.argv[1:]):
parser = argparse.ArgumentParser("pvacseq net_chop", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
@@ -64,7 +53,7 @@ def main(args_input = sys.argv[1:]):
i=1
print("Waiting for results from NetChop... |", end='')
sys.stdout.flush()
for chunk in split_file(reader, 100):
for chunk in lib.utils.split_file(reader, 100):
staging_file = tempfile.NamedTemporaryFile(mode='w+')
current_buffer = {}
for line in chunk:
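For reference, the generator deleted above is the source of the dropped-epitope bug described in the release notes: after yielding a full chunk it consumes one extra line with next(reader), and the enclosing for loop then advances the iterator again, so that line never reaches any chunk. A minimal sketch of the behavior, with illustrative peptide names and a chunk size of 2 instead of 100:

    from itertools import islice, chain

    def old_split_file(reader, lines=400):
        # Copy of the generator removed from lib/net_chop.py above.
        for tmp in reader:
            if tmp != "":
                yield chain([tmp], islice(reader, lines - 1))
                try:
                    # Reads one line; the for loop then rebinds tmp from the
                    # iterator, so this line is silently discarded.
                    tmp = next(reader)
                except StopIteration:
                    return
            else:
                break

    epitopes = ['pep1', 'pep2', 'pep3', 'pep4', 'pep5']
    print([list(chunk) for chunk in old_split_file(iter(epitopes), lines=2)])
    # [['pep1', 'pep2'], ['pep4', 'pep5']] -- 'pep3' is lost at the chunk boundary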
15 changes: 2 additions & 13 deletions lib/netmhc_stab.py
@@ -6,22 +6,11 @@
import re
import os
from time import sleep
import lib.utils

cycle = ['|', '/', '-', '\\']
methods = ['cterm', '20s']

def split_file(reader, lines=400):
    from itertools import islice, chain
    for tmp in reader:
        if tmp != "":
            yield chain([tmp], islice(reader, lines-1))
            try:
                tmp = next(reader)
            except StopIteration:
                return
        else:
            break

def main(args_input = sys.argv[1:]):
parser = argparse.ArgumentParser("pvacseq net_chop", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
@@ -51,7 +40,7 @@ def main(args_input = sys.argv[1:]):
i=1
print("Waiting for results from NetMHCStabPan... |", end='')
sys.stdout.flush()
for chunk in split_file(reader, 100):
for chunk in lib.utils.split_file(reader, 100):
peptide_lengths = set()
staging_file = tempfile.NamedTemporaryFile(mode='w+')
current_buffer = {}
161 changes: 152 additions & 9 deletions lib/pipeline.py
@@ -480,8 +480,11 @@ def fasta_entry_count(self):
row_count += 1
return row_count

def split_fasta_basename(self):
return os.path.join(self.tmp_dir, self.sample_name + ".fa.split")
def fasta_basename(self, length):
return os.path.join(self.tmp_dir, "{}.{}.fa".format(self.sample_name, length))

def split_fasta_basename(self, length):
return "{}.split".format(self.fasta_basename(length))

def uniquify_records(self, records):
fasta_sequences = OrderedDict()
@@ -497,7 +500,14 @@ def uniquify_records(self, records):
count += 1
return (uniq_records, keys)

def split_fasta_file(self):
def create_per_length_fasta(self, length):
records = []
for record in SeqIO.parse(self.input_file, "fasta"):
if len(str(record.seq)) >= length:
records.append(record)
SeqIO.write(records, self.fasta_basename(length), "fasta")

def split_fasta_file(self, length):
fasta_entry_count = self.fasta_entry_count()
status_message("Splitting FASTA into smaller chunks")
chunks = []
@@ -508,7 +518,7 @@
if split_end > fasta_entry_count:
split_end = fasta_entry_count
status_message("Splitting FASTA into smaller chunks - Entries %d-%d" % (split_start, split_end))
split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(), split_start, split_end)
split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(length), split_start, split_end)
split_fasta_key_file_path = "{}.key".format(split_fasta_file_path)
chunks.append([split_start, split_end])
if os.path.exists(split_fasta_file_path):
@@ -517,7 +527,7 @@
else:
split_fasta_records = []
skip = 0
for record in SeqIO.parse(self.input_file, "fasta"):
for record in SeqIO.parse(self.fasta_basename(length), "fasta"):
if skip == 0:
split_fasta_records.append(record)
if row_count == fasta_entry_count:
@@ -535,7 +545,7 @@
if split_end > fasta_entry_count:
split_end = fasta_entry_count
status_message("Splitting FASTA into smaller chunks - Entries %d-%d" % (split_start, split_end))
split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(), split_start, split_end)
split_fasta_file_path = "%s_%d-%d" % (self.split_fasta_basename(length), split_start, split_end)
split_fasta_key_file_path = "{}.key".format(split_fasta_file_path)
chunks.append([split_start, split_end])
if os.path.exists(split_fasta_file_path):
@@ -554,12 +564,145 @@
status_message("Completed")
return chunks

def call_iedb(self, chunks, length):
alleles = self.alleles
prediction_algorithms = self.prediction_algorithms
argument_sets = []
warning_messages = []
for (split_start, split_end) in chunks:
tsv_chunk = "%d-%d" % (split_start, split_end)
if self.input_file_type == 'fasta':
fasta_chunk = tsv_chunk
else:
fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
for a in alleles:
split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(length), fasta_chunk)
if os.path.getsize(split_fasta_file_path) == 0:
msg = "Fasta file {} is empty. Skipping".format(split_fasta_file_path)
if msg not in warning_messages:
warning_messages.append(msg)
continue
#begin of per-algorithm processing
for method in prediction_algorithms:
prediction_class = globals()[method]
prediction = prediction_class()
if hasattr(prediction, 'iedb_prediction_method'):
iedb_method = prediction.iedb_prediction_method
else:
iedb_method = method
valid_alleles = prediction.valid_allele_names()
if a not in valid_alleles:
msg = "Allele %s not valid for Method %s. Skipping." % (a, method)
if msg not in warning_messages:
warning_messages.append(msg)
continue
valid_lengths = prediction.valid_lengths_for_allele(a)
if length not in valid_lengths:
msg = "Epitope Length %s is not valid for Method %s and Allele %s. Skipping." % (length, method, a)
if msg not in warning_messages:
warning_messages.append(msg)
continue

split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(length), "tsv_%s" % fasta_chunk]))
if os.path.exists(split_iedb_out):
msg = "Prediction file for Allele %s and Epitope Length %s with Method %s (Entries %s) already exists. Skipping." % (a, length, method, fasta_chunk)
if msg not in warning_messages:
warning_messages.append(msg)
continue
arguments = [
split_fasta_file_path,
split_iedb_out,
method,
a,
'-r', str(self.iedb_retries),
'-e', self.iedb_executable,
]
if not isinstance(prediction, IEDBMHCII):
arguments.extend(['-l', str(length),])
argument_sets.append(arguments)

for msg in warning_messages:
status_message(msg)

with pymp.Parallel(self.n_threads) as p:
for index in p.range(len(argument_sets)):
arguments = argument_sets[index]
a = arguments[3]
method = arguments[2]
filename = arguments[1]
if len(arguments) == 10:
epl = arguments[9]
else:
epl = 15
p.print("Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s" % (a, epl, method, filename))
lib.call_iedb.main(arguments)
p.print("Making binding predictions on Allele %s and Epitope Length %s with Method %s - File %s - Completed" % (a, epl, method, filename))

def parse_outputs(self, chunks, length):
split_parsed_output_files = []
for (split_start, split_end) in chunks:
tsv_chunk = "%d-%d" % (split_start, split_end)
if self.input_file_type == 'fasta':
fasta_chunk = tsv_chunk
else:
fasta_chunk = "%d-%d" % (split_start*2-1, split_end*2)
for a in self.alleles:
split_iedb_output_files = []
status_message("Parsing binding predictions for Allele %s and Epitope Length %s - Entries %s" % (a, length, fasta_chunk))
for method in self.prediction_algorithms:
prediction_class = globals()[method]
prediction = prediction_class()
if hasattr(prediction, 'iedb_prediction_method'):
iedb_method = prediction.iedb_prediction_method
else:
iedb_method = method
valid_alleles = prediction.valid_allele_names()
if a not in valid_alleles:
continue
valid_lengths = prediction.valid_lengths_for_allele(a)
if length not in valid_lengths:
continue
split_iedb_out = os.path.join(self.tmp_dir, ".".join([self.sample_name, iedb_method, a, str(length), "tsv_%s" % fasta_chunk]))
if os.path.exists(split_iedb_out):
split_iedb_output_files.append(split_iedb_out)

split_parsed_file_path = os.path.join(self.tmp_dir, ".".join([self.sample_name, a, str(length), "parsed", "tsv_%s" % fasta_chunk]))
if os.path.exists(split_parsed_file_path):
status_message("Parsed Output File for Allele %s and Epitope Length %s (Entries %s) already exists. Skipping" % (a, length, fasta_chunk))
split_parsed_output_files.append(split_parsed_file_path)
continue
split_fasta_file_path = "%s_%s"%(self.split_fasta_basename(length), fasta_chunk)
split_fasta_key_file_path = split_fasta_file_path + '.key'

if len(split_iedb_output_files) > 0:
status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s" % (a, length, fasta_chunk))
split_tsv_file_path = "%s_%s" % (self.tsv_file_path(), tsv_chunk)
params = {
'input_iedb_files' : split_iedb_output_files,
'input_tsv_file' : split_tsv_file_path,
'key_file' : split_fasta_key_file_path,
'output_file' : split_parsed_file_path,
}
if self.additional_report_columns and 'sample_name' in self.additional_report_columns:
params['sample_name'] = self.sample_name
else:
params['sample_name'] = None
parser = self.output_parser(params)
parser.execute()
status_message("Parsing prediction file for Allele %s and Epitope Length %s - Entries %s - Completed" % (a, length, fasta_chunk))

split_parsed_output_files.append(split_parsed_file_path)
return split_parsed_output_files

def execute(self):
self.print_log()

chunks = self.split_fasta_file()
self.call_iedb(chunks)
split_parsed_output_files = self.parse_outputs(chunks)
split_parsed_output_files = []
for length in self.epitope_lengths:
self.create_per_length_fasta(length)
chunks = self.split_fasta_file(length)
self.call_iedb(chunks, length)
split_parsed_output_files.extend(self.parse_outputs(chunks, length))

if len(split_parsed_output_files) == 0:
status_message("No output files were created. Aborting.")
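The pVACbind fix for short peptide sequences hinges on the new create_per_length_fasta method above: for each epitope length, the input fasta is first reduced to the records long enough to yield epitopes of that length, and only that per-length file is split and sent for prediction. A minimal standalone sketch of the same filtering step using Biopython, as the pipeline does; the helper name and file paths here are illustrative, not part of the pVACtools API:

    from Bio import SeqIO

    def write_per_length_fasta(input_fasta, output_fasta, epitope_length):
        # Keep only records at least as long as the requested epitope length.
        records = [
            record for record in SeqIO.parse(input_fasta, "fasta")
            if len(record.seq) >= epitope_length
        ]
        SeqIO.write(records, output_fasta, "fasta")

    # One filtered fasta per chosen epitope length, e.g. typical class I lengths:
    for length in (8, 9, 10, 11):
        write_per_length_fasta("sample.fa", "sample.{}.fa".format(length), length)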
8 changes: 8 additions & 0 deletions lib/utils.py
@@ -1,5 +1,13 @@
import binascii
from itertools import islice

def is_gz_file(filepath):
    with open(filepath, 'rb') as test_f:
        return binascii.hexlify(test_f.read(2)) == b'1f8b'

def split_file(reader, lines):
    i = iter(reader)
    piece = list(islice(i, lines))
    while piece:
        yield piece
        piece = list(islice(i, lines))
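The replacement generator materializes successive fixed-size slices of the iterator, so every input line lands in exactly one chunk and nothing is skipped between chunks. A quick usage sketch with illustrative values (the NetChop and NetMHCstabpan wrappers call it with chunks of 100):

    from itertools import islice

    def split_file(reader, lines):
        # Same logic as lib.utils.split_file above.
        i = iter(reader)
        piece = list(islice(i, lines))
        while piece:
            yield piece
            piece = list(islice(i, lines))

    epitopes = ['pep1', 'pep2', 'pep3', 'pep4', 'pep5']
    print(list(split_file(epitopes, 2)))
    # [['pep1', 'pep2'], ['pep3', 'pep4'], ['pep5']] -- no epitopes are dropped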
2 changes: 1 addition & 1 deletion setup.py
@@ -61,7 +61,7 @@

setup(
name="pvactools",
version="1.5.2",
version="1.5.3",
packages=[
"tools",
"tools.pvacbind",