VEuPathDB · dmgaldi · Apr 23, 2024 · Mar 27, 2024 · Apr 2, 2024 · Apr 10, 2024
diff --git a/Model/pom.xml b/Model/pom.xml
@@ -110,6 +110,11 @@
       <artifactId>commons-lang3</artifactId>
     </dependency>
 
+    <!-- commons-codec supplies the DigestUtils and Hex classes imported by VDIMigrationPlugin to hash legacy
+         variable IDs. NOTE(review): no <version> is declared here; presumably it is managed by a parent POM
+         or a dependencyManagement section; confirm. -->
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+    </dependency>
+
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-databind</artifactId>

diff --git a/...src/main/java/org/gusdb/wdk/model/fix/table/edaanalysis/plugins/VDIEntityIdRetriever.java b/...src/main/java/org/gusdb/wdk/model/fix/table/edaanalysis/plugins/VDIEntityIdRetriever.java
@@ -0,0 +1,30 @@
+package org.gusdb.wdk.model.fix.table.edaanalysis.plugins;
+
+import org.gusdb.fgputil.db.runner.SQLRunner;
+
+import javax.sql.DataSource;
+import java.util.Optional;
+
+/**
+ * Looks up the {@code internal_abbrev} value associated with a VDI dataset by joining the
+ * {@code userstudydatasetid} and {@code entitytypegraph} tables of the configured schema.
+ */
+public class VDIEntityIdRetriever {
+  private final DataSource eda;
+  private final String sql;
+
+  /**
+   * @param eda    data source the lookup query is run against
+   * @param schema schema that owns both tables; it is formatted directly into the SQL text, so it must come
+   *               from a trusted source (identifiers cannot be bound as JDBC parameters)
+   */
+  public VDIEntityIdRetriever(DataSource eda, String schema) {
+    this.eda = eda;
+    // The statement only depends on the schema, so build it once instead of on every lookup.
+    this.sql = String.format("SELECT internal_abbrev FROM %s.userstudydatasetid u" +
+        " JOIN %s.entitytypegraph etg" +
+        " ON u.study_stable_id = etg.study_stable_id" +
+        " WHERE dataset_stable_id = ?", schema, schema);
+  }
+
+  /**
+   * Runs the lookup for one dataset.
+   *
+   * @param vdiStableId value bound to the {@code dataset_stable_id} predicate
+   * @return the {@code internal_abbrev} of the first row returned, or empty if there is no row or the column
+   *         is null
+   */
+  public Optional<String> queryEntityId(String vdiStableId) {
+    return new SQLRunner(eda, sql).executeQuery(new Object[] { vdiStableId }, rs -> {
+      // NOTE(review): only the first row is read; if the join can yield several rows for one dataset the
+      // choice depends on row order, which this query does not fix. Confirm a single row is expected.
+      if (!rs.next()) {
+        return Optional.empty();
+      }
+      return Optional.ofNullable(rs.getString("internal_abbrev"));
+    });
+  }
+}
diff --git a/...l/src/main/java/org/gusdb/wdk/model/fix/table/edaanalysis/plugins/VDIMigrationPlugin.java b/...l/src/main/java/org/gusdb/wdk/model/fix/table/edaanalysis/plugins/VDIMigrationPlugin.java
@@ -1,89 +1,153 @@
 package org.gusdb.wdk.model.fix.table.edaanalysis.plugins;
 
 import com.fasterxml.jackson.databind.JsonNode;
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.log4j.Logger;
 import org.gusdb.fgputil.json.JsonUtil;
 import org.gusdb.wdk.model.WdkModel;
-import org.gusdb.wdk.model.fix.VdiMigrationFileReader;
 import org.gusdb.wdk.model.fix.table.TableRowInterfaces;
 import org.gusdb.wdk.model.fix.table.edaanalysis.AbstractAnalysisUpdater;
 import org.gusdb.wdk.model.fix.table.edaanalysis.AnalysisRow;
+import org.json.JSONObject;
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.regex.Pattern;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collectors;
 
 public class VDIMigrationPlugin extends AbstractAnalysisUpdater {
   private static final Logger LOG = Logger.getLogger(VDIMigrationPlugin.class);
-  public static final String UD_DATASET_ID_PREFIX = "EDAUD_";
+  // Prefix that processRecord requires on a dataset ID before it attempts a migration.
+  private static final String UD_DATASET_ID_PREFIX = "EDAUD_";
+  // Matches variableId": followed by optional whitespace and an opening quote; group 1 captures the ID
+  // (letters, digits, underscore, hyphen).
+  private static final Pattern VAR_ID_PATTERN = Pattern.compile("variableId\":\\s*\"([a-zA-Z0-9_-]+)");
+  // Same shape as VAR_ID_PATTERN, but for entityId": values.
+  private static final Pattern ENTITY_ID_PATTERN = Pattern.compile("entityId\":\\s*\"([a-zA-Z0-9_-]+)");
 
-  private Map<String, String> legacyIdToVdiId;
+  // Legacy user dataset ID (without the prefix) to VDI ID; populated by readVdiMappingFile.
+  private Map<String, String> _legacyIdToVdiId;
+  // Queried in processRecord for the entity ID of the migrated dataset; set via setEntityIdRetriever.
+  private VDIEntityIdRetriever _vdiEntityIdRetriever;
+  // Number of rows whose legacy ID had no entry in the mapping; reported by dumpStatistics.
   private final AtomicInteger missingFromVdiCount = new AtomicInteger(0);
 
+  /**
+   * Configures the plugin from arguments of the form {@code --<argname>=<argvalue>}.
+   * <ul>
+   *   <li>{@code --tinyDb} (required): path of the mapping file handed to readVdiMappingFile</li>
+   *   <li>{@code --schema} (required): schema name passed to the VDIEntityIdRetriever</li>
+   *   <li>{@code --write} (optional): parsed as a boolean; when absent nothing is written (dry run)</li>
+   * </ul>
+   *
+   * @param wdkModel       model whose app DB data source backs the entity ID lookups
+   * @param additionalArgs raw arguments; each argument name may appear only once
+   * @throws IllegalArgumentException if {@code --tinyDb} or {@code --schema} is missing
+   */
+  @Override
+  public void configure(WdkModel wdkModel, List<String> additionalArgs) throws Exception {
+    // Parse args in the format --<argname>=<argvalue>. Split on the first "=" only, so a value that itself
+    // contains "=" (for example a file path) is kept whole instead of being cut at the second "=".
+    final Map<String, String> args = additionalArgs.stream()
+        .map(arg -> arg.split("=", 2))
+        .collect(Collectors.toMap(
+            pair -> pair[0].trim(),
+            // A flag without a value ("--flag" or "--flag=") is a boolean. Set true if present.
+            pair -> pair.length > 1 && !pair[1].isEmpty() ? pair[1].trim() : "true"));
+
+    // Validate required args.
+    if (!args.containsKey("--tinyDb")) {
+      throw new IllegalArgumentException("Missing required flag --tinyDb");
+    }
+    if (!args.containsKey("--schema")) {
+      throw new IllegalArgumentException("Missing required argument --schema");
+    }
+
+    final String schema = args.get("--schema");
+    setEntityIdRetriever(new VDIEntityIdRetriever(wdkModel.getAppDb().getDataSource(), schema));
+
+    final File tinyDbFile = new File(args.get("--tinyDb"));
+    readVdiMappingFile(tinyDbFile);
+
+    // Default to dryrun to avoid incidental migrations when testing.
+    _writeToDb = Boolean.parseBoolean(args.getOrDefault("--write", "false"));
+    _wdkModel = wdkModel;
+  }
+
+  /**
+   * Installs the retriever that processRecord queries for entity IDs. Package-private so tests can inject
+   * their own instance.
+   *
+   * @param entityIdRetriever retriever to use for subsequent lookups
+   */
+  void setEntityIdRetriever(VDIEntityIdRetriever entityIdRetriever) {
+    this._vdiEntityIdRetriever = entityIdRetriever;
+  }
+
+  /**
+   * Loads the legacy-ID-to-VDI-ID mapping from the given file and keeps it for processRecord.
+   * Package-private so tests can load a fixture without going through configure.
+   *
+   * @param mappingFile file handed to readLegacyStudyIdToVdiId
+   */
+  void readVdiMappingFile(File mappingFile) {
+    final Map<String, String> mapping = readLegacyStudyIdToVdiId(mappingFile);
+    this._legacyIdToVdiId = mapping;
+  }
+
   @Override
   public TableRowInterfaces.RowResult<AnalysisRow> processRecord(AnalysisRow nextRow) throws Exception {
     final String legacyDatasetId = nextRow.getDatasetId();
+
     if (!legacyDatasetId.startsWith(UD_DATASET_ID_PREFIX)) {
-      return new TableRowInterfaces.RowResult<>(nextRow)
-          .setShouldWrite(false);
+      return new TableRowInterfaces.RowResult<>(nextRow).setShouldWrite(false);
     }
 
     final String legacyUdId = legacyDatasetId.replace(UD_DATASET_ID_PREFIX, "");
-    final String vdiId = legacyIdToVdiId.get(legacyUdId);
+    final String vdiId = _legacyIdToVdiId.get(legacyUdId);
 
     if (vdiId == null) {
       LOG.warn("Unable to find legacy ID " + legacyUdId + " in the tinydb file.");
       missingFromVdiCount.incrementAndGet();
-      return new TableRowInterfaces.RowResult<>(nextRow);
+      return new TableRowInterfaces.RowResult<>(nextRow)
+          .setShouldWrite(false);
     }
 
     // Append UD prefix to VDI ID. The prefix is prepended in the view that maps stable VDI IDs to the unstable study
     // ID, which is the currency of EDA.
     final String vdiDatasetId = UD_DATASET_ID_PREFIX + vdiId;
+    final Optional<String> vdiEntityId = _vdiEntityIdRetriever.queryEntityId(vdiDatasetId);
+    if (vdiEntityId.isEmpty()) {
+      LOG.warn("Unable to find entity ID in appdb for VDI dataset ID: " + vdiDatasetId);
+      return new TableRowInterfaces.RowResult<>(nextRow)
+          .setShouldWrite(false);
+    }
+
+    LOG.info("Analysis descriptor before migration: " + nextRow.getDescriptor());
+    String descriptor = nextRow.getDescriptor().toString();
+
+    // Find all variable IDs.
+    final Set<String> legacyVariableIds = VAR_ID_PATTERN.matcher(descriptor).results()
+        .map(match -> match.group(1))
+        .collect(Collectors.toSet());
+
+    final String entityId = ENTITY_ID_PATTERN.matcher(descriptor).results()
+        .findAny()
+        .map(m -> m.group(1))
+        .orElse(null);
+
+    // Replace all entityID with entityID looked up from database.
+    if (entityId != null) {
+      descriptor = descriptor.replaceAll(entityId, vdiEntityId.get());
+    }
+
+    // Replace all variable IDs with value converted from legacy variable ID.
+    for (String legacyVariableId: legacyVariableIds) {
+      descriptor = descriptor.replaceAll(legacyVariableId, convertToVdiId(legacyVariableId));
+    }
+
+    // Create a copy with just the dataset ID updated to VDI counterpart.
+    nextRow.setDescriptor(new JSONObject(descriptor));
     nextRow.setDatasetId(vdiDatasetId);
 
+    LOG.info("Analysis descriptor after migration: " + descriptor);
+
     return new TableRowInterfaces.RowResult<>(nextRow)
         .setShouldWrite(_writeToDb);
   }
 
+  /**
+   * Derives the replacement for a legacy variable ID: the literal "VAR_" followed by the first 16 hex
+   * characters of the SHA-1 digest of the ID's UTF-8 bytes.
+   * NOTE(review): presumably this mirrors how VDI derives variable IDs, so the algorithm must not change
+   * independently. Confirm.
+   *
+   * @param legacyVariableId variable ID as it appears in the legacy descriptor
+   * @return the derived ID, always 20 characters long
+   */
+  private String convertToVdiId(String legacyVariableId) {
+    // sha1Hex(String) hashes the UTF-8 bytes and renders lower-case hex.
+    final String sha1Hex = DigestUtils.sha1Hex(legacyVariableId);
+    return "VAR_" + sha1Hex.substring(0, 16);
+  }
+
   @Override
   public void dumpStatistics() {
     if (missingFromVdiCount.get() > 0) {
       LOG.warn("Failed to migrate " + missingFromVdiCount + " datasets, they were not found in the provided tinydb file.");
     }
   }
 
-  @Override
-  public void configure(WdkModel wdkModel, List<String> additionalArgs) throws Exception {
-    // Parse args in the format --<argname>=<argvalue>
-    final Map<String, String> args = additionalArgs.stream()
-        .map(arg -> Arrays.stream(arg.split("="))
-            .map(String::trim) // Trim whitespace from args
-            .collect(Collectors.toList()))
-        .collect(Collectors.toMap(
-            pair -> pair.get(0),
-            pair -> pair.size() > 1 ? pair.get(1) : "true")); // A flag without an "=" is a boolean. Set true if present.
-
-    // Validate required arg.
-    if (!args.containsKey("--tinyDb")) {
-      throw new IllegalArgumentException("Missing required flag --tinyDb");
-    }
-
-    final File tinyDbFile = new File(args.get("--tinyDb"));
-    VdiMigrationFileReader reader = new VdiMigrationFileReader(tinyDbFile);
-
-    this.legacyIdToVdiId = reader.readLegacyStudyIdToVdiId();
-
-    // Default to dryrun to avoid incidental migrations when testing.
-    this._writeToDb = Boolean.parseBoolean(args.getOrDefault("--liveRun", "false"));
-  }
-
   /**
    * Parse the tinydb file into a map of legacy UD identifiers to VDI identifiers.
    *

diff --git a/...c/test/java/org/gusdb/wdk/model/fix/table/edaanalysis/plugins/VDIMigrationPluginTest.java b/...c/test/java/org/gusdb/wdk/model/fix/table/edaanalysis/plugins/VDIMigrationPluginTest.java
@@ -10,50 +10,61 @@
 import org.mockito.Mockito;
 
 import java.io.File;
-import java.util.List;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Objects;
+import java.util.Optional;
 
+ // Tests for VDIMigrationPlugin.processRecord. The mapping file and analysis descriptor are loaded from
+ // classpath resources and the entity ID lookup is a Mockito mock, so no database is needed.
 public class VDIMigrationPluginTest {
   private WdkModel mockedModel;
   private ClassLoader classLoader;
+  private VDIEntityIdRetriever retriever;
 
   @Before
   public void setup() {
     classLoader = getClass().getClassLoader();
+    // NOTE(review): mockedModel is no longer passed to anything in the tests below now that configure()
+    // is not called; it may be removable.
     mockedModel = Mockito.mock(WdkModel.class);
+    retriever = Mockito.mock(VDIEntityIdRetriever.class);
   }
 
   @Test
   public void testUpdateEnabled() throws Exception {
+    File analysisFile = new File(Objects.requireNonNull(classLoader.getResource("analysis-unit-test-1.json")).getFile());
+    JSONObject descriptor = new JSONObject(Files.readString(Path.of(analysisFile.getPath())));
     final File file = new File(Objects.requireNonNull(classLoader.getResource("migration-unit-test-1.json")).getFile());
     final VDIMigrationPlugin migrationPlugin = new VDIMigrationPlugin();
-    final List<String> args = List.of("--tinyDb=" + file.getPath());
-    migrationPlugin.configure(mockedModel, args);
+    // configure() is bypassed: the mapping and the retriever are injected through the package-private hooks.
+    // NOTE(review): presumably the mapping fixture maps legacy ID 1234 to 123XyZ; confirm against the file.
+    Mockito.when(retriever.queryEntityId("EDAUD_123XyZ")).thenReturn(Optional.of("asdf"));
+    migrationPlugin.readVdiMappingFile(file);
+    migrationPlugin.setEntityIdRetriever(retriever);
     TableRowInterfaces.RowResult<AnalysisRow> result = migrationPlugin.processRecord(
         new AnalysisRow("x",
             "EDAUD_1234",
-            new JSONObject(),
+            descriptor,
             3,
             4,
             5));
     Assert.assertEquals("EDAUD_123XyZ", result.getRow().getDatasetId());
+    // The descriptor must now contain a converted variable ID ("VAR_" + 16 hex characters).
+    Assert.assertTrue(result.getRow().getDescriptor().toString().contains("VAR_c73e53adb951e2fe"));
     Assert.assertFalse(result.shouldWrite());
   }
 
   @Test
   public void testUpdateDisabled() throws Exception {
+    // NOTE(review): apart from the missing descriptor assertion, this test is now identical to
+    // testUpdateEnabled. Neither calls configure(), so the --write path is not exercised and both expect
+    // shouldWrite() to be false. Consider renaming the tests or adding a case that enables writing.
+    File analysisFile = new File(Objects.requireNonNull(classLoader.getResource("analysis-unit-test-1.json")).getFile());
+    JSONObject descriptor = new JSONObject(Files.readString(Path.of(analysisFile.getPath())));
     final File file = new File(Objects.requireNonNull(classLoader.getResource("migration-unit-test-1.json")).getFile());
     final VDIMigrationPlugin migrationPlugin = new VDIMigrationPlugin();
-    final List<String> args = List.of("--tinyDb=" + file.getPath(), "--liveRun");
-    migrationPlugin.configure(mockedModel, args);
+    Mockito.when(retriever.queryEntityId("EDAUD_123XyZ")).thenReturn(Optional.of("asdf"));
+    migrationPlugin.readVdiMappingFile(file);
+    migrationPlugin.setEntityIdRetriever(retriever);
     TableRowInterfaces.RowResult<AnalysisRow> result = migrationPlugin.processRecord(
         new AnalysisRow("x",
             "EDAUD_1234",
-            new JSONObject(),
+            descriptor,
             3,
             4,
             5));
     Assert.assertEquals("EDAUD_123XyZ", result.getRow().getDatasetId());
-    Assert.assertTrue(result.shouldWrite());
+    Assert.assertFalse(result.shouldWrite());
   }
 }
diff --git a/Model/src/test/resources/analysis-unit-test-1.json b/Model/src/test/resources/analysis-unit-test-1.json