From f23a0340f5476fa118d2ca3f689905d8383a48c8 Mon Sep 17 00:00:00 2001
From: dcruvolo <davidruvolo51@gmail.com>
Date: Fri, 15 Mar 2024 12:09:01 +0100
Subject: [PATCH] fix: added new mappings; improved new sample processing;
 minor fixes

---
 rd3/novelomics_shipment_processing.py | 108 +++++++++++++++++++-------
 1 file changed, 78 insertions(+), 30 deletions(-)

diff --git a/rd3/novelomics_shipment_processing.py b/rd3/novelomics_shipment_processing.py
index 28838c0..4c0b6a3 100644
--- a/rd3/novelomics_shipment_processing.py
+++ b/rd3/novelomics_shipment_processing.py
@@ -1,9 +1,9 @@
 """Solve-RD Novelomics: new shipment file processing
-FILE: solverd_novelomics_processing.py
+FILE: novelomics_shipment_processing.py
 AUTHOR: David Ruvolo
 CREATED: 2022-11-15
-MODIFIED: 2024-03-04
-PURPOSE: Import new novelomics data
+MODIFIED: 2024-03-14
+PURPOSE: Process new shipment manifest files - register new subjects and samples
 STATUS: stable
 PACKAGES: **see below**
 COMMENTS: NA
@@ -14,7 +14,7 @@
 import operator
 import re
 from dotenv import load_dotenv
-from datatable import dt, f, as_type
+from datatable import dt, f
 from tqdm import tqdm
 
 from rd3tools.molgenis import Molgenis
@@ -70,6 +70,13 @@ def get_wrapped_values(val: str = None):
 shipment_dt = dt.Frame(shipment_raw)
 del shipment_dt['_href']
 
+# if unprocessed records need to be deleted because of data errors, then
+# uncomment and run the following command
+# rd3_prod.delete_list(
+#     entity='rd3_portal_novelomics_shipment',
+#     entities=shipment_dt['molgenis_id'].to_list()[0]
+# )
+
 # ///////////////////////////////////////////////////////////////////////////////
 
 # ~ 1 ~
@@ -230,24 +237,31 @@ def get_wrapped_values(val: str = None):
 )
 
 # check incoming data, update mappings (if applicable), and rerun
-new_tissue_types = dt.unique(
+incoming_tissue_types = dt.unique(
     shipment_dt[f.tissue_type != None, 'tissue_type']).to_list()[0]
 
-new_tissue_types.sort(key=str.lower)
-for value in new_tissue_types:
+incoming_tissue_types.sort(key=str.lower)
+for value in incoming_tissue_types:
     if value.lower() not in tissue_type_mappings:
         print(f"Value '{value}' not in tissue type mappings")
 
+# update mappings for cases that are simple recodes
 tissue_type_mappings.update({
+    'adipose tissue': 'Adipose',
     'blood': 'Whole Blood',
     'cell pellet': 'Cells',
+    'exelid': 'Eyelid',
+    'fat skin': 'Adipose - Subcutaneous',
     'fibroblasts': 'Cells - Cultured fibroblasts',
+    "fetus skin": "Foetus",
     'fetus': 'Foetus',
     'ffpe': 'Tumor',
     'heart': 'Heart',
     'muscle': 'Muscle - Skeletal',
     'pbmc': 'Peripheral Blood Mononuclear Cells',
-    'whole blood': 'Whole Blood'
+    'whole blood': 'Whole Blood',
+    'subcutaneous fat': 'Adipose - Subcutaneous',
+    'tissue': 'Tissue - unspecified',
 })
 
 # ///////////////////////////////////////
@@ -256,11 +270,21 @@ def get_wrapped_values(val: str = None):
 # Create anatomical location mappings
 
 if 'anatomical_location' in shipment_dt.names:
-    print('Checking anatomical location mappings....')
+    print('Manually check anatomical location mappings!')
 
     # As of 06 Dec 2022, the value 'blood' can be ignored as it cannot be mapped
-    # to a more specific term
+    # to a more specific term.
+    # As of 15 March 2024, terms that do not exist in RD3 are recoded as "Other"
+    # (74964007). The original value is kept in a separate comment column, as it
+    # isn't possible to determine the exact location from the supplied value.
     anatomical_location_mappings = {
+        'chest': '74964007',  # Other
+        'left': '74964007',  # Other
+        'nose': '74964007',  # Other
+        'retro right auricular area': '74964007',  # Other
+        'right': '74964007',  # Other
+        'scalp': '74964007',  # Other
+
         'chest skin': '74160004',  # Skin of Chest
         'skin scalp': '43067004',  # Skin of Scalp
         # Entire skin of postauricular region
@@ -295,8 +319,7 @@ def get_wrapped_values(val: str = None):
 )['id']
 
 material_types['mappingID'] = dt.Frame([
-    value.lower()
-    for value in material_types['id'].to_list()[0]
+    value.lower() for value in material_types['id'].to_list()[0]
 ])
 
 material_type_mappings = as_key_pairs(
@@ -350,7 +373,10 @@ def get_wrapped_values(val: str = None):
                 f"Value '{value}' does not exist in pathological state mappings")
 
   # if there are any values, enter them below ->
-  # pathological_state_mappings.update({ ... })
+    pathological_state_mappings.update({
+        'affected area': 'Affected',
+        'safe area': 'Normal'
+    })
 
 # ///////////////////////////////////////////////////////////////////////////////
 
@@ -396,7 +422,7 @@ def get_wrapped_values(val: str = None):
 
 # recode anatomical location (if available)
 if 'anatomical_location' in shipment_dt.names:
-    shipment_dt['anatomical_location'] = dt.Frame([
+    shipment_dt['tmp_anatomical_location'] = dt.Frame([
         recode_value(
             mappings=anatomical_location_mappings,
             value=value.lower(),
@@ -405,6 +431,15 @@ def get_wrapped_values(val: str = None):
         for value in shipment_dt['anatomical_location'].to_list()[0]
     ])
 
+    # identify cases recoded as "other" and keep the originally supplied value
+    shipment_dt['anatomical_location_comment'] = dt.Frame([
+        row[1] if row[0] == "74964007" else None
+        for row in shipment_dt[:, ['tmp_anatomical_location', 'anatomical_location']].to_tuples()
+    ])
+
+    shipment_dt['anatomical_location'] = shipment_dt['tmp_anatomical_location']
+    del shipment_dt['tmp_anatomical_location']
+
 # recode sample types (i.e., materialType)
 shipment_dt['sample_type'] = dt.Frame([
     'TISSUE (FFPE)'
@@ -531,7 +566,9 @@ def get_wrapped_values(val: str = None):
     'tissue_type': 'tissueType',
     'sample_type': 'materialType',
     'pathological_state': 'pathological_state',
-    'tumor_cell_fraction': 'percentageTumorCells'
+    'tumor_cell_fraction': 'percentageTumorCells',
+    'anatomical_location': 'anatomicalLocation',
+    'anatomical_location_comment': 'anatomicalLocationComment',
 }
 
 # ///////////////////////////////////////////////////////////////////////////////
@@ -606,15 +643,18 @@ def get_wrapped_values(val: str = None):
     curr_sample = dt_as_recordset(samples_dt[f.sampleID == sample_id, :])[0]
 
     # identify records that require manually verification
-    # for column in columns_with_major_conflicts:
-    #   if (column in incomingSample) and (column in existingSample):
-    #     if incomingSample[column] != existingSample[column]:
-    #       print(f"Incoming sample {id} has conflicting {column} values")
-    #       samples_with_conflicts.append({
-    #         'incomingValue': incomingSample[column],
-    #         'existingValue': existingSample[column],
-    #         'message': f"values in {column} do not match"
-    #       })
+    for column in columns_with_major_conflicts:
+        if (column in new_sample) and (column in curr_sample):
+            if new_sample[column] is not None and curr_sample[column] is not None:
+                # compare whole values: a substring test hides 'Blood' vs 'Whole Blood'
+                curr_values = curr_sample[column].split(',')
+                if new_sample[column] not in curr_values:
+                    curr_values.append(new_sample[column])
+                    samples_with_conflicts.append({
+                        'sampleID': sample_id,
+                        'subjectID': new_sample['subjectID'],
+                        column: ','.join(dict.fromkeys(curr_values)),
+                    })
 
     # identify columns that can automatically imported
     for column in columns_with_minor_conflicts:
@@ -661,14 +701,18 @@ def get_wrapped_values(val: str = None):
 # ~ 3a ~
 # Import new subject metadata
 new_subjects_dt = shipment_dt[
-    f.isNewSubject, (
+    f.isNewSubject,
+    (
         f.subjectID,
         f.organisation,
         f.ERN,
         f.partOfRelease,
         f.dateRecordCreated,
-        f.recordCreatedBy)]
+        f.recordCreatedBy
+    )
+]
 
+# if there are no new subjects, then you can skip to 3b
 if not new_subjects_dt.nrows:
     print('No subjects to import. You may skip this step')
 
@@ -687,7 +731,7 @@ def get_wrapped_values(val: str = None):
 
 # ///////////////////////////////////////
 
-# ~ 2b ~
+# ~ 3b ~
 # Import new sample metadata
 new_samples_dt = shipment_dt[
     f.isNewSample, (
@@ -698,6 +742,8 @@ def get_wrapped_values(val: str = None):
         f.materialType,
         f.pathological_state,
         f.percentageTumorCells,
+        f.anatomicalLocation,
+        f.anatomicalLocationComment,
         f.partOfRelease,
         f.batch,
         f.organisation,
@@ -705,12 +751,14 @@ def get_wrapped_values(val: str = None):
         f.dateRecordCreated,
         f.recordCreatedBy)]
 
+new_samples_dt['retracted'] = 'N'
 new_samples_dt.names = {'subjectID': 'belongsToSubject'}
+
 rd3_prod.import_dt('solverd_samples', new_samples_dt)
 
 # ///////////////////////////////////////
 
-# ~ 2b ~
+# ~ 3c ~
 # Update subject release information
 
 existing_subjects_dt = shipment_dt[f.isNewSubject == False, :]
@@ -743,7 +791,7 @@ def get_wrapped_values(val: str = None):
 
 # ///////////////////////////////////////
 
-# ~ 2c ~
+# ~ 3d ~
 # Update release and batch info in the samples table
 
 if bool(samples_with_updates):
@@ -789,7 +837,7 @@ def get_wrapped_values(val: str = None):
 
 # ///////////////////////////////////////
 
-# ~ 2d ~
+# ~ 3e ~
 # Update processed status in the portal table
 
 processed_ids = []