Merge pull request #163 from griffithlab/hotfix

New version 3.0.5
griffithlab · Aug 22, 2016 · 4caf8b5 · 4caf8b5
2 parents 79c45be + 8463e35
commit 4caf8b5
Show file tree

Hide file tree

Showing 10 changed files with 46 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -2,12 +2,19 @@
 Cancer immunotherapy has gained significant momentum from recent clinical successes of checkpoint blockade inhibition. Massively parallel sequence analysis suggests a connection between mutational load and response to this class of therapy. Methods to identify which tumor-specific mutant peptides (neoantigens) can elicit anti-tumor T cell immunity are needed to improve predictions of checkpoint therapy response and to identify targets for vaccines and adoptive T cell therapies. Here, we provide a cancer immunotherapy pipeline for the identification of **p**ersonalized **V**ariant **A**ntigens by **C**ancer **Seq**uencing (pVAC-Seq) that integrates tumor mutation and expression data (DNA- and RNA-Seq).
 http://www.genomemedicine.com/content/8/1/11
 
+## New in version 3.0.5
+<ul>
+<li>Bugfix: The generation of the fasta file would fail for some insertions with a range position. This is now fixed.</li>
+<li>Bugfix: The generation of the fasta file would fail if the wildtype or downstream sequences were too long. The size limit for these fields has been increased to the maximum size supported by the user's system. This error might still occur if a sequence is longer than that limit.</li>
+<li>Bugfix: When rerunning a command, an error would occur if the <code>tmp</code> subdirectory already existed in the output directory. This has now been fixed.</li>
+</ul>
+
 ## New in version 3.0.4
 <ul>
 <li>Certain intermediate files are now written into a <code>tmp</code> directory underneath the main output directory. This <code>tmp</code> directory will be deleted at the end of a successful run unless the <code>--keep-tmp-files</code> flag is set.</li>
 <li>Intermediate files will now not be reprocessed if they already exist in the output directory. This can be helpful if a run exits early, for example, when a 500 Error was returned by IEDB. In this case the user can now simply run the same <code>pvacseq run</code> command again and the run will pick up where it failed previously.</li>
 <li>We added a new option <code>--fasta-size</code> that the user can set to specify how many FASTA entries at a time will be included in a request to the IEDB RESTful API. The default is 200 but certain variants or prediction algorithms might warrant a smaller number of FASTA entries in order to avoid timeouts from IEDB.</li>
-<li>Bugfix: The parsing step would fail for frameshift mutations with a range position. This is now fixed.</li>
+<li>Bugfix: The generation of the fasta file would fail for frameshift mutations with a range position. This is now fixed.</li>
 <li>Bugfix: Previously a run might fail if certain intermediate files weren't created.</li>
 <li>Bugfix: Using <code>.</code> in the output directory name and the sample name would previously result in errors. This has now been fixed.</li>
 <li>Bugfix: Using a relative directory path for the output directory would previously result in an error. This is now fixed.</li>

diff --git a/pvacseq/lib/convert_vcf.py b/pvacseq/lib/convert_vcf.py
@@ -84,7 +84,7 @@ def main(args_input = sys.argv[1:]):
         reference  = entry.REF
         alts       = entry.ALT
 
-        alleles_dict = resolve_alleles(entry);
+        alleles_dict = resolve_alleles(entry)
         for alt in alts:
             alt = str(alt)
             csq_allele = alleles_dict[alt]

diff --git a/pvacseq/lib/generate_fasta.py b/pvacseq/lib/generate_fasta.py
@@ -3,6 +3,8 @@
 import re
 import sys
 
+csv.field_size_limit(sys.maxsize)
+
 def position_out_of_bounds(position, sequence):
     return position > len(sequence)-1
 
@@ -12,7 +14,7 @@ def distance_from_start(position, string):
     return position
 
 def distance_from_end(position, string):
-    return len(string) - 1 - position;
+    return len(string) - 1 - position
 
 def determine_peptide_sequence_length(full_wildtype_sequence_length, peptide_sequence_length, line):
     actual_peptide_sequence_length = peptide_sequence_length
@@ -81,33 +83,27 @@ def main(args_input = sys.argv[1:]):
 
     peptide_sequence_length = args.peptide_sequence_length
     tsvin                   = csv.DictReader(args.input_file, delimiter='\t')
-    pattern                 = re.compile('([A-Z])(\d+)([A-Z])');
+    pattern                 = re.compile('([A-Z])(\d+)([A-Z])')
     for line in tsvin:
         variant_type = line['variant_type']
         full_wildtype_sequence = line['wildtype_amino_acid_sequence']
         if variant_type == 'FS':
-            if '-' in line['protein_position']:
-                position = int(line['protein_position'].split('-', 1)[0]) - 1
-            else:
-                position = int(line['protein_position']) - 1
+            position = int(line['protein_position'].split('-', 1)[0]) - 1
         elif variant_type == 'missense' or variant_type == 'inframe_ins':
             wildtype_amino_acid, mutant_amino_acid = line['amino_acid_change'].split('/')
             if wildtype_amino_acid == '-':
                 position = int(line['protein_position'].split('-', 1)[0])
                 wildtype_amino_acid_length = 0
             else:
-                position = int(line['protein_position']) - 1;
+                position = int(line['protein_position']) - 1
                 wildtype_amino_acid_length = len(wildtype_amino_acid)
         elif variant_type == 'inframe_del':
             variant_type = 'inframe_del'
             wildtype_amino_acid, mutant_amino_acid = line['amino_acid_change'].split('/')
+            position = int(line['protein_position'].split('-', 1)[0]) - 1
+            wildtype_amino_acid_length = len(wildtype_amino_acid)
             if mutant_amino_acid == '-':
-                position = int(line['protein_position']) - 1;
-                wildtype_amino_acid_length = len(wildtype_amino_acid)
-                mutant_amino_acid = '';
-            else:
-                position = int(line['protein_position'].split('-', 1)[0]) - 1
-                wildtype_amino_acid_length = len(wildtype_amino_acid)
+                mutant_amino_acid = ''
         else:
             continue
 
@@ -120,7 +116,7 @@ def main(args_input = sys.argv[1:]):
         else:
             mutation_start_position, wildtype_subsequence = get_wildtype_subsequence(position, full_wildtype_sequence, wildtype_amino_acid_length, peptide_sequence_length, line)
             mutation_end_position = mutation_start_position + wildtype_amino_acid_length
-            mutant_subsequence = wildtype_subsequence[:mutation_start_position] + mutant_amino_acid + wildtype_subsequence[mutation_end_position:];
+            mutant_subsequence = wildtype_subsequence[:mutation_start_position] + mutant_amino_acid + wildtype_subsequence[mutation_end_position:]
 
         variant_id = line['index']
         for designation, subsequence in zip(['WT', 'MT'], [wildtype_subsequence, mutant_subsequence]):

diff --git a/pvacseq/lib/generate_fasta_key.py b/pvacseq/lib/generate_fasta_key.py
@@ -16,7 +16,7 @@ def main(args_input = sys.argv[1:]):
     tsvout = csv.writer(tmp_output_filehandle, delimiter='\t', lineterminator='\n')
 
     i = 1
-    pattern = re.compile('>');
+    pattern = re.compile('>')
     for line in args.input_file:
         match = pattern.match(line)
         if match is not None:

diff --git a/pvacseq/lib/main.py b/pvacseq/lib/main.py
@@ -237,7 +237,8 @@ def main(args_input = sys.argv[1:]):
         sys.exit("The fasta file is empty. Please check that the input VCF contains missense, inframe indel, or frameshift mutations.")
 
     tmp_dir = os.path.join(args.output_dir, 'tmp')
-    os.makedirs(tmp_dir)
+    if not os.path.exists(tmp_dir):
+        os.makedirs(tmp_dir)
     chunks                    = split_fasta_file_and_create_key_files(args, fasta_file_path, tmp_dir)
     split_parsed_output_files = call_iedb_and_parse_outputs(args, chunks, tsv_file_path, tmp_dir)
 

diff --git a/pvacseq/lib/parse_output.py b/pvacseq/lib/parse_output.py
@@ -12,6 +12,8 @@
 from statistics import median
 from lib import pvacseq_utils
 
+csv.field_size_limit(sys.maxsize)
+
 def prediction_method_lookup(prediction_method):
     prediction_method_lookup_dict = pvacseq_utils.iedb_to_prediction_method_lookup_dict()
     return prediction_method_lookup_dict[prediction_method]

diff --git a/setup.py b/setup.py
@@ -15,7 +15,7 @@
 
 setup(
     name="pvacseq",
-    version="3.0.4",
+    version="3.0.5",
     packages=["pvacseq", "pvacseq.lib"],
     entry_points={
         "console_scripts":[

diff --git a/tests/test_data/generate_fasta/input_inframe_deletion_range.tsv b/tests/test_data/generate_fasta/input_inframe_deletion_range.tsv
@@ -0,0 +1,3 @@
+chromosome_name	start	stop	reference	variant	gene_name	transcript_name	amino_acid_change	ensembl_gene_id	wildtype_amino_acid_sequence	downstream_amino_acid_sequence	variant_type	protein_position	index
+4	140651584	140651590	TCTGCTG	T	MAML3	ENST00000509479	QQ/-	ENSG00000196782	MGDFAAPAAAANGSSICINSSLNSSLGGAGIGVNNTPNSTPAAPSSNHPAAGGCGGSGGPGGGSAAVPKHSTVVERLRQRIEGCRRHHVNCENRYQQAQVEQLELERRDTVSLYQRTLEQRAKKSGAGTGKQQHPSKPQQDAEAASAEQRNHTLIMLQETVKRKLEGARSPLNGDQQNGACDGNFSPTSKRIRKDISAGMEAINNLPSNMPLPSASPLHQLDLKPSLPLQNSGTHTPGLLEDLSKNGRLPEIKLPVNGCSDLEDSFTILQSKDLKQEPLDDPTCIDTSETSLSNQNKLFSDINLNDQEWQELIDELANTVPEDDIQDLFNEDFEEKKEPEFSQPATETPLSQESASVKSDPSHSPFAHVSMGSPQARPSSSGPPFSTVSTATSLPSVASTPAAPNPASSPANCAVQSPQTPNQAHTPGQAPPRPGNGYLLNPAAVTVAGSASGPVAVPSSDMSPAEQLKQMAAQQQQRAKLMQQKQQQQQQQQQQQQQQQQQQQQQQQQQHSNQTSNWSPLGPPSSPYGAAFTAEKPNSPMMYPQAFNNQNPIVPPMANNLQKTTMNNYLPQNHMNMINQQPNNLGTNSLNKQHNILTYGNTKPLTHFNADLSQRMTPPVANPNKNPLMPYIQQQQQQQQQQQQQQQQQQPPPPQLQAPRAHLSEDQKRLLLMKQKGVMNQPMAYAALPSHGQEQHPVGLPRTTGPMQSSVPPGSGGMVSGASPAGPGFLGSQPQAAIMKQMLIDQRAQLIEQQKQQFLREQRQQQQQQQQQILAEQQLQQSHLPRQHLQPQRNPYPVQQVNQFQGSPQDIAAVRSQAALQSMRTSRLMAQNAGMMGIGPSQNPGTMATAAAQSEMGLAPYSTTPTSQPGMYNMSTGMTQMLQHPNQSGMSITHNQAQGPRQPASGQGVGMVSGFGQSMLVNSAITQQHPQMKGPVGQALPRPQAPPRLQSLMGTVQQGAQSWQQRSLQGMPGRTSGELGPFNNGASYPLQAGQPRLTKQHFPQGLSQSVVDANTGTVRTLNPAAMGRQMMPSLPGQQGTSQARPMVMSGLSQGVPGMPAFSQPPAQQQIPSGSFAPSSQSQAYERNAPQDVSYNYSGDGAGGSFPGLPDGADLVDSIIKGGPGDEWMQELDELFGNP		inframe_del	771-772	MAML3_ENST00000509479_1.inframe_del.771-772QQ/-
+4	140651584	140651590	TCTGCTG	T	MAML3	ENST00000502696	QQ/-	ENSG00000196782	XRAKKSGAGTGKQQHPSKPQQDAEAASAEQRNHTLIMEQHPVGLPRTTGPMQSSVPPGSGGMVSGASPAGPGFLGSQPQAAIMKQMLIDQRAQLIEQQKQQFLREQRQQQQQQQQQILAEQQLQQSHLPRQHLQPQRNPYPVQQVNQFQGSPQDIAAVRSQAALQSMRTSRLMAQNAGMMGIGPSQ		inframe_del	115-116	MAML3_ENST00000502696_1.inframe_del.115-116QQ/-
diff --git a/tests/test_data/generate_fasta/output_inframe_deletion_range.fasta b/tests/test_data/generate_fasta/output_inframe_deletion_range.fasta
@@ -0,0 +1,8 @@
+>WT.MAML3_ENST00000509479_1.inframe_del.771-772QQ/-
+EQRQQQQQQQQQILAEQQLQQS
+>MT.MAML3_ENST00000509479_1.inframe_del.771-772QQ/-
+EQRQQQQQQQILAEQQLQQS
+>WT.MAML3_ENST00000502696_1.inframe_del.115-116QQ/-
+EQRQQQQQQQQQILAEQQLQQS
+>MT.MAML3_ENST00000502696_1.inframe_del.115-116QQ/-
+EQRQQQQQQQILAEQQLQQS
diff --git a/tests/test_generate_fasta.py b/tests/test_generate_fasta.py
@@ -152,6 +152,16 @@ def test_input_file_with_inframe_deletion_amino_acid_deletion_generates_expected
         expected_output_file = os.path.join(self.test_data_dir, 'output_inframe_deletion_aa_deletion.fasta')
         self.assertTrue(cmp(generate_fasta_output_file.name, expected_output_file))
 
+    def test_input_file_with_inframe_deletion_range(self):
+        generate_fasta_input_file  = os.path.join(self.test_data_dir, 'input_inframe_deletion_range.tsv')
+        generate_fasta_output_file = tempfile.NamedTemporaryFile()
+
+        self.assertFalse(call([
+            self.python, self.executable, generate_fasta_input_file, self.peptide_sequence_length, generate_fasta_output_file.name
+        ], shell=False))
+        expected_output_file = os.path.join(self.test_data_dir, 'output_inframe_deletion_range.fasta')
+        self.assertTrue(cmp(generate_fasta_output_file.name, expected_output_file))
+
     def test_input_file_with_frameshift_variant_feature_truncation_generates_expected_file(self):
         generate_fasta_input_file  = os.path.join(self.test_data_dir, 'input_frameshift_variant_feature_truncation.tsv')
         generate_fasta_output_file = tempfile.NamedTemporaryFile()