Merge branch 'griffithlab:master' into master
Layth17 authored Oct 27, 2023
2 parents 23c0908 + 60dcec2 commit 8ce356e
Showing 11 changed files with 2,862 additions and 2,836 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -70,7 +70,7 @@
# The short X.Y version.
version = '4.0'
# The full version, including alpha/beta/rc tags.
release = '4.0.3'
release = '4.0.5'


# The language for content autogenerated by Sphinx. Refer to documentation
11 changes: 9 additions & 2 deletions docs/index.rst
@@ -56,8 +56,15 @@ New in Release |release|

This is a bugfix release. It fixes the following problem(s):

- This fixes an issue in the reference proteome similarity step in pVACseq
  where running with non-human data would cause an error.
- In recent releases, users have noticed that at some point during pipeline
  runs, MHCflurry prediction jobs would hang or get killed. We were able to
  determine that the cause was related to
  `PR 988 <https://github.com/griffithlab/pVACtools/pull/988>`_.
  That PR updated the calls to MHCflurry to instantiate its predictor within
  Python instead of invoking it on the command line. However, we suspect that
  this caused a substantial increase in memory usage, resulting in the
  observed behavior. This release reverts the change from PR 988.

New in Version |version|
------------------------
23 changes: 23 additions & 0 deletions docs/releases/4_0.rst
@@ -106,3 +106,26 @@ This is a bugfix release. It fixes the following problem(s):

- This fixes an issue in the reference proteome similarity step in pVACseq
  where running with non-human data would cause an error.

New in Version 4.0.4
--------------------

This is a bugfix release. It fixes the following problem(s):

- This release makes various fixes to allow pVACtools to run with non-human
data.

New in Version 4.0.5
--------------------

This is a bugfix release. It fixes the following problem(s):

- In recent releases, users have noticed that at some point during pipeline
  runs, MHCflurry prediction jobs would hang or get killed. We were able to
  determine that the cause was related to
  `PR 988 <https://github.com/griffithlab/pVACtools/pull/988>`_.
  That PR updated the calls to MHCflurry to instantiate its predictor within
  Python instead of invoking it on the command line. However, we suspect that
  this caused a substantial increase in memory usage, resulting in the
  observed behavior. This release reverts the change from PR 988.
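For context, the difference between the two invocation styles described in this note can be sketched roughly as follows. This is an illustrative sketch, not the exact pVACtools code; the function name predict_with_cli and its arguments are placeholders, and only command-line flags that appear in the diff below (--alleles, --out, --peptides) are used:

    # Reverted approach (PR 988): load the MHCflurry predictor in-process, which
    # keeps the presentation models resident in the pipeline's own memory:
    #   from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor
    #   predictor = Class1PresentationPredictor.load(models_dir)
    #   df = predictor.predict(peptides=epitopes, alleles={allele: [allele]})

    # Restored approach: shell out to the mhcflurry-predict command-line tool so
    # that model loading and prediction happen in a separate process.
    import subprocess
    import tempfile

    import pandas as pd

    def predict_with_cli(epitopes, allele):
        # Write predictions to a temporary CSV and read them back as a DataFrame.
        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as out:
            out_name = out.name
        cmd = ["mhcflurry-predict", "--alleles", allele,
               "--out", out_name, "--peptides"] + list(epitopes)
        subprocess.run(cmd, check=True)
        return pd.read_csv(out_name)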
8 changes: 1 addition & 7 deletions pvactools/lib/calculate_reference_proteome_similarity.py
@@ -269,13 +269,7 @@ def _input_tsv_type(self, line):
def _get_full_peptide(self, line, mt_records_dict, wt_records_dict):
for record_id in mt_records_dict.keys():
(rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2)
transcript_regex = '^.*(ENS[0-9|A-Z|.]+)$'
transcript_p = re.compile(transcript_regex)
m = transcript_p.match(rest_record_id)
if m:
transcript = m.group(1)
else:
raise Exception("Unexpected record_id format: {}".format(record_id))
(count, gene, transcript) = rest_record_id.split(".", 2)
(parsed_aa_change, pos, wt_aa, mt_aa) = index_to_aggregate_report_aa_change(aa_change, variant_type)
if line['Best Transcript'] == transcript and line['AA Change'] == parsed_aa_change:
return (mt_records_dict[record_id], wt_records_dict[record_id], variant_type, mt_aa, wt_aa)
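The replacement line assumes the record ID follows a fixed dot-separated layout (count.gene.transcript.variant_type.aa_change) rather than matching an Ensembl-style transcript regex. A minimal illustration, using the record naming visible in the updated test FASTA at the end of this commit (the exact keys of mt_records_dict may differ):

    record_id = "1.Rp1.ENSMUST00000027032.missense.1453N/S"  # hypothetical dict key
    rest_record_id, variant_type, aa_change = record_id.rsplit(".", 2)
    # rest_record_id == "1.Rp1.ENSMUST00000027032", variant_type == "missense", aa_change == "1453N/S"
    count, gene, transcript = rest_record_id.split(".", 2)
    # count == "1", gene == "Rp1", transcript == "ENSMUST00000027032"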
46 changes: 21 additions & 25 deletions pvactools/lib/prediction_class.py
@@ -13,11 +6,6 @@
from Bio import SeqIO
import random
import uuid
from mhcflurry.downloads import get_default_class1_presentation_models_dir
from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor
import numpy

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

class IEDB(metaclass=ABCMeta):
@classmethod
@@ -323,28 +318,29 @@ def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb

all_epitopes = list(set(all_epitopes))
if len(all_epitopes) > 0:
models_dir = get_default_class1_presentation_models_dir(test_exists=True)
predictor = Class1PresentationPredictor.load(models_dir)
df = predictor.predict(
peptides=numpy.array(all_epitopes, dtype='object'),
n_flanks=None,
c_flanks=None,
alleles={allele: [allele]},
throw=True,
include_affinity_percentile=True,
verbose=0
)
tmp_output_file = tempfile.NamedTemporaryFile('r', dir=tmp_dir, delete=False)
arguments = ["mhcflurry-predict", "--alleles", allele, "--out", tmp_output_file.name, "--peptides"]
arguments.extend(all_epitopes)
stderr_fh = tempfile.NamedTemporaryFile('w', dir=tmp_dir, delete=False)
try:
response = run(arguments, check=True, stdout=DEVNULL, stderr=stderr_fh)
except:
stderr_fh.close()
with open(stderr_fh.name, 'r') as fh:
err = fh.read()
os.unlink(stderr_fh.name)
raise Exception("An error occurred while calling MHCflurry:\n{}".format(err))
stderr_fh.close()
os.unlink(stderr_fh.name)
tmp_output_file.close()
df = pd.read_csv(tmp_output_file.name)
os.unlink(tmp_output_file.name)
df.rename(columns={
'prediction': 'ic50',
'affinity': 'ic50',
'prediction_percentile': 'percentile',
'affinity_percentile': 'percentile',
'processing_score': 'mhcflurry_processing_score',
'presentation_score': 'mhcflurry_presentation_score',
'presentation_percentile': 'mhcflurry_presentation_percentile',
'best_allele': 'allele',
'mhcflurry_prediction': 'ic50',
'mhcflurry_affinity': 'ic50',
'mhcflurry_prediction_percentile': 'percentile',
'mhcflurry_affinity_percentile': 'percentile'
}, inplace=True)
df.drop(labels='peptide_num', axis=1, inplace=True)
for record in SeqIO.parse(input_file, "fasta"):
seq_num = record.id
peptide = str(record.seq)
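With the command-line call restored, the DataFrame read back from mhcflurry-predict carries mhcflurry_-prefixed column names, which is why the rename keys above changed. A toy sketch of the normalization; the sample peptide and values are made up, and only column names that appear in the diff are used:

    import pandas as pd

    # Pretend mhcflurry-predict output (only a few columns shown).
    df = pd.DataFrame({
        "peptide": ["SIINFEKL"],
        "mhcflurry_affinity": [25.3],
        "mhcflurry_affinity_percentile": [0.12],
    })
    df.rename(columns={
        "mhcflurry_prediction": "ic50",
        "mhcflurry_affinity": "ic50",
        "mhcflurry_prediction_percentile": "percentile",
        "mhcflurry_affinity_percentile": "percentile",
    }, inplace=True)
    print(df.columns.tolist())  # ['peptide', 'ic50', 'percentile']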
8 changes: 1 addition & 7 deletions pvactools/tools/pvacseq/generate_protein_fasta.py
@@ -166,13 +166,7 @@ def parse_files(output_file, temp_dir, mutant_only, input_tsv, aggregate_report_
continue
else:
(rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2)
transcript_regex = '^.*(ENST[0-9|.]+)$'
transcript_p = re.compile(transcript_regex)
m = transcript_p.match(rest_record_id)
if m:
transcript = m.group(1)
else:
raise Exception("Unexpected record_id format: {}".format(record_id))
(peptide_type, count, gene, transcript) = rest_record_id.split(".", 3)
(parsed_aa_change, _, _, _) = index_to_aggregate_report_aa_change(aa_change, variant_type)
matches = [i for i in tsv_indexes if i['Best Transcript'] == transcript and i['AA Change'] == parsed_aa_change and i['Evaluation'] in aggregate_report_evaluation]
if len(matches) == 0:
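Here the record IDs carry an additional leading peptide type (WT/MT), matching the updated FASTA headers at the end of this commit, so the split keeps one more field. A minimal illustration:

    record_id = "MT.1.Rp1.ENSMUST00000027032.missense.1453N/S"  # header format from the updated test FASTA
    rest_record_id, variant_type, aa_change = record_id.rsplit(".", 2)
    peptide_type, count, gene, transcript = rest_record_id.split(".", 3)
    # peptide_type == "MT", count == "1", gene == "Rp1", transcript == "ENSMUST00000027032"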
16 changes: 14 additions & 2 deletions pvactools/tools/pvacview/server.R
@@ -96,7 +96,13 @@ server <- shinyServer(function(input, output, session) {
df$allele_specific_anchors <- df$metricsData$`allele_specific_anchors`
df$anchor_contribution <- df$metricsData$`anchor_contribution_threshold`
hla <- df$metricsData$alleles
converted_hla_names <- unlist(lapply(hla, function(x) {strsplit(x, "HLA-")[[1]][2]}))
converted_hla_names <- unlist(lapply(hla, function(x) {
if (grepl("HLA-", x)) {
strsplit(x, "HLA-")[[1]][2]
} else {
x
}
}))
if (!("Ref Match" %in% colnames(df$mainTable))) {
df$mainTable$`Ref Match` <- "Not Run"
}
@@ -172,7 +178,13 @@
df$allele_specific_anchors <- df$metricsData$`allele_specific_anchors`
df$anchor_contribution <- df$metricsData$`anchor_contribution_threshold`
hla <- df$metricsData$alleles
converted_hla_names <- unlist(lapply(hla, function(x) {strsplit(x, "HLA-")[[1]][2]}))
converted_hla_names <- unlist(lapply(hla, function(x) {
if (grepl("HLA-", x)) {
strsplit(x, "HLA-")[[1]][2]
} else {
x
}
}))
if (!("Ref Match" %in% colnames(df$mainTable))) {
df$mainTable$`Ref Match` <- "Not Run"
}
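The guarded version above keeps allele names that lack the HLA- prefix (e.g. non-human MHC alleles) intact instead of producing NA. The same idea expressed in Python for illustration; the mouse allele name is an assumed example:

    def convert_hla_name(name):
        # Strip a leading "HLA-" prefix when present; otherwise return the name unchanged.
        return name.split("HLA-", 1)[1] if "HLA-" in name else name

    print(convert_hla_name("HLA-A*02:01"))  # A*02:01
    print(convert_hla_name("H-2-Kb"))       # H-2-Kb passes through unchanged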
2 changes: 1 addition & 1 deletion setup.py
@@ -51,7 +51,7 @@

setup(
name="pvactools",
version="4.0.3",
version="4.0.5",
packages=[
"pvactools.tools",
"pvactools.tools.pvacbind",
4 changes: 2 additions & 2 deletions tests/test_call_iedb.py
@@ -96,8 +96,8 @@ def test_mhcflurry_method_generates_expected_files(self):
])
if sys.platform == 'darwin':
expected_output_file = os.path.join(self.test_data_dir, 'output_mhcflurry_osx.tsv')
expected_df = pd.read_csv(expected_output_file, sep="\t", index_col=[0,8,9])
actual_df = pd.read_csv(call_iedb_output_file.name, sep="\t", index_col=[0,8,9])
expected_df = pd.read_csv(expected_output_file, sep="\t", index_col=[1,7,8])
actual_df = pd.read_csv(call_iedb_output_file.name, sep="\t", index_col=[1,7,8])
pd.testing.assert_frame_equal(expected_df, actual_df, check_like=True, check_exact=False, rtol=0.05)

def test_mhcnuggetsi_method_generates_expected_files(self):
@@ -1,4 +1,4 @@
>WT.Rp1.ENSMUST00000027032.missense.1453N/S
>WT.1.Rp1.ENSMUST00000027032.missense.1453N/S
IAGTLKFNPETDYLTGTDG
>MT.Rp1.ENSMUST00000027032.missense.1453N/S
>MT.1.Rp1.ENSMUST00000027032.missense.1453N/S
IAGTLKFNPQTDYLTGTDG