Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

migrate analyses vdi ids #88

Merged
merged 17 commits into from
Apr 23, 2024
5 changes: 5 additions & 0 deletions Model/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@
<artifactId>commons-lang3</artifactId>
</dependency>

<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.gusdb.wdk.model.fix.table.edaanalysis.plugins;

import org.gusdb.fgputil.db.runner.SQLRunner;

import javax.sql.DataSource;
import java.util.Optional;

/**
 * Looks up the entity abbreviation that belongs to a VDI dataset by querying the
 * {@code userstudydatasetid} and {@code entitytypegraph} relations of a configurable schema.
 */
public class VDIEntityIdRetriever {
  // Both collaborators are fixed at construction time and never reassigned.
  private final DataSource eda;
  private final String schema;

  /**
   * @param eda    data source the lookup query is executed against
   * @param schema name of the schema holding the queried relations; it is interpolated into the
   *               SQL text verbatim, so it must be a trusted identifier
   */
  public VDIEntityIdRetriever(DataSource eda, String schema) {
    this.eda = eda;
    this.schema = schema;
  }

  /**
   * Finds the {@code internal_abbrev} of the entity joined to the given dataset stable ID.
   *
   * @param vdiStableId value matched against {@code dataset_stable_id} (bound as a query parameter)
   * @return the {@code internal_abbrev} of the first result row, or an empty Optional when the
   *         query yields no rows or the column value is SQL NULL
   */
  public Optional<String> queryEntityId(String vdiStableId) {
    final String sql = String.format("SELECT internal_abbrev FROM %s.userstudydatasetid u" +
        " JOIN %s.entitytypegraph etg" +
        " ON u.study_stable_id = etg.study_stable_id" +
        " WHERE dataset_stable_id = ?", schema, schema);
    return new SQLRunner(eda, sql).executeQuery(new Object[] { vdiStableId }, rs -> {
      // Only the first row is inspected; any additional rows are ignored.
      if (!rs.next()) {
        return Optional.empty();
      }
      return Optional.ofNullable(rs.getString("internal_abbrev"));
    });
  }
}
Original file line number Diff line number Diff line change
@@ -1,89 +1,153 @@
package org.gusdb.wdk.model.fix.table.edaanalysis.plugins;

import com.fasterxml.jackson.databind.JsonNode;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.log4j.Logger;
import org.gusdb.fgputil.json.JsonUtil;
import org.gusdb.wdk.model.WdkModel;
import org.gusdb.wdk.model.fix.VdiMigrationFileReader;
import org.gusdb.wdk.model.fix.table.TableRowInterfaces;
import org.gusdb.wdk.model.fix.table.edaanalysis.AbstractAnalysisUpdater;
import org.gusdb.wdk.model.fix.table.edaanalysis.AnalysisRow;
import org.json.JSONObject;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

public class VDIMigrationPlugin extends AbstractAnalysisUpdater {
private static final Logger LOG = Logger.getLogger(VDIMigrationPlugin.class);
public static final String UD_DATASET_ID_PREFIX = "EDAUD_";
private static final String UD_DATASET_ID_PREFIX = "EDAUD_";
private static final Pattern VAR_ID_PATTERN = Pattern.compile("variableId\":\\s*\"([a-zA-Z0-9_-]+)");
private static final Pattern ENTITY_ID_PATTERN = Pattern.compile("entityId\":\\s*\"([a-zA-Z0-9_-]+)");

private Map<String, String> legacyIdToVdiId;
private Map<String, String> _legacyIdToVdiId;
private VDIEntityIdRetriever _vdiEntityIdRetriever;
private final AtomicInteger missingFromVdiCount = new AtomicInteger(0);

/**
 * Configures the plugin from arguments of the form {@code --<argname>=<argvalue>}.
 *
 * <p>Recognized arguments:
 * <ul>
 *   <li>{@code --tinyDb} (required): path of the file mapping legacy IDs to VDI IDs</li>
 *   <li>{@code --schema} (required): schema name handed to the {@link VDIEntityIdRetriever}</li>
 *   <li>{@code --write} (optional): when it parses as {@code true}, rows are flagged for writing;
 *       otherwise the plugin performs a dry run</li>
 * </ul>
 *
 * @param wdkModel       model supplying the application database data source
 * @param additionalArgs raw plugin arguments
 * @throws IllegalArgumentException if a required argument is missing
 */
@Override
public void configure(WdkModel wdkModel, List<String> additionalArgs) throws Exception {
  // Parse args in the format --<argname>=<argvalue>. Split on the first "=" only so that a value
  // which itself contains "=" (for example a file path) is not truncated.
  final Map<String, String> args = additionalArgs.stream()
      .map(arg -> arg.split("=", 2))
      .collect(Collectors.toMap(
          pair -> pair[0].trim(),
          // A flag without a value is a boolean. Set true if present.
          pair -> (pair.length > 1 && !pair[1].isEmpty()) ? pair[1].trim() : "true"));

  // Validate required args.
  if (!args.containsKey("--tinyDb")) {
    throw new IllegalArgumentException("Missing required flag --tinyDb");
  }
  if (!args.containsKey("--schema")) {
    throw new IllegalArgumentException("Missing required argument --schema");
  }

  final String schema = args.get("--schema");
  setEntityIdRetriever(new VDIEntityIdRetriever(wdkModel.getAppDb().getDataSource(), schema));

  final File tinyDbFile = new File(args.get("--tinyDb"));
  readVdiMappingFile(tinyDbFile);

  // Default to dryrun to avoid incidental migrations when testing.
  _writeToDb = Boolean.parseBoolean(args.getOrDefault("--write", "false"));
  _wdkModel = wdkModel;
}

/**
 * Installs the retriever used to resolve entity IDs for VDI datasets.
 * Package-private so that tests can substitute their own instance.
 *
 * @param entityIdRetriever retriever to use for subsequent lookups
 */
void setEntityIdRetriever(VDIEntityIdRetriever entityIdRetriever) {
  this._vdiEntityIdRetriever = entityIdRetriever;
}

/**
 * Loads the legacy-ID-to-VDI-ID mapping from the given file and stores it on this plugin.
 * Package-private so that tests can load a mapping without going through {@code configure}.
 *
 * @param mappingFile file handed to {@code readLegacyStudyIdToVdiId}
 */
void readVdiMappingFile(File mappingFile) {
  final Map<String, String> mapping = readLegacyStudyIdToVdiId(mappingFile);
  this._legacyIdToVdiId = mapping;
}

@Override
public TableRowInterfaces.RowResult<AnalysisRow> processRecord(AnalysisRow nextRow) throws Exception {
final String legacyDatasetId = nextRow.getDatasetId();

if (!legacyDatasetId.startsWith(UD_DATASET_ID_PREFIX)) {
return new TableRowInterfaces.RowResult<>(nextRow)
.setShouldWrite(false);
return new TableRowInterfaces.RowResult<>(nextRow).setShouldWrite(false);
}

final String legacyUdId = legacyDatasetId.replace(UD_DATASET_ID_PREFIX, "");
final String vdiId = legacyIdToVdiId.get(legacyUdId);
final String vdiId = _legacyIdToVdiId.get(legacyUdId);

if (vdiId == null) {
LOG.warn("Unable to find legacy ID " + legacyUdId + " in the tinydb file.");
missingFromVdiCount.incrementAndGet();
return new TableRowInterfaces.RowResult<>(nextRow);
return new TableRowInterfaces.RowResult<>(nextRow)
.setShouldWrite(false);
}

// Append UD prefix to VDI ID. The prefix is prepended in the view that maps stable VDI IDs to the unstable study
// ID, which is the currency of EDA.
final String vdiDatasetId = UD_DATASET_ID_PREFIX + vdiId;
final Optional<String> vdiEntityId = _vdiEntityIdRetriever.queryEntityId(vdiDatasetId);
if (vdiEntityId.isEmpty()) {
LOG.warn("Unable to find entity ID in appdb for VDI dataset ID: " + vdiDatasetId);
return new TableRowInterfaces.RowResult<>(nextRow)
.setShouldWrite(false);
}

LOG.info("Analysis descriptor before migration: " + nextRow.getDescriptor());
String descriptor = nextRow.getDescriptor().toString();

// Find all variable IDs.
final Set<String> legacyVariableIds = VAR_ID_PATTERN.matcher(descriptor).results()
.map(match -> match.group(1))
.collect(Collectors.toSet());

final String entityId = ENTITY_ID_PATTERN.matcher(descriptor).results()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure there's only one here? We could easily have a config with >1 variable spec, each of which might contain a different entity ID, especially across different visualizations.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's only one entity allowed in ISA simple studies

.findAny()
.map(m -> m.group(1))
.orElse(null);

// Replace all entityID with entityID looked up from database.
if (entityId != null) {
descriptor = descriptor.replaceAll(entityId, vdiEntityId.get());
}

// Replace all variable IDs with value converted from legacy variable ID.
for (String legacyVariableId: legacyVariableIds) {
descriptor = descriptor.replaceAll(legacyVariableId, convertToVdiId(legacyVariableId));
}

// Create a copy with just the dataset ID updated to VDI counterpart.
nextRow.setDescriptor(new JSONObject(descriptor));
nextRow.setDatasetId(vdiDatasetId);

LOG.info("Analysis descriptor after migration: " + descriptor);

return new TableRowInterfaces.RowResult<>(nextRow)
.setShouldWrite(_writeToDb);
}

/**
 * Derives the VDI variable ID for a legacy variable ID: {@code "VAR_"} followed by the first 16
 * hexadecimal characters of the SHA-1 digest of the ID's UTF-8 bytes.
 *
 * @param legacyVariableId variable ID as it appears in the legacy descriptor
 * @return the converted variable ID
 */
private String convertToVdiId(String legacyVariableId) {
  // sha1Hex hashes the UTF-8 encoding of the string and returns the digest as lowercase hex.
  final String sha1Hex = DigestUtils.sha1Hex(legacyVariableId);
  return "VAR_" + sha1Hex.substring(0, 16);
}

/**
 * Logs a warning with the number of rows whose legacy ID was not found in the tinydb mapping.
 * Nothing is logged when no such rows were counted.
 */
@Override
public void dumpStatistics() {
  final int missing = missingFromVdiCount.get();
  if (missing <= 0) {
    return;
  }
  LOG.warn("Failed to migrate " + missing + " datasets, they were not found in the provided tinydb file.");
}

/**
 * Configures the plugin from arguments of the form {@code --<argname>=<argvalue>}.
 *
 * <p>Recognized arguments: {@code --tinyDb} (required), the path of the file mapping legacy IDs
 * to VDI IDs; and {@code --liveRun} (optional), which enables writing when it parses as
 * {@code true}. Without it the plugin performs a dry run.
 *
 * @param wdkModel       not used by this method
 * @param additionalArgs raw plugin arguments
 * @throws IllegalArgumentException if {@code --tinyDb} is missing
 */
@Override
public void configure(WdkModel wdkModel, List<String> additionalArgs) throws Exception {
  // Parse args in the format --<argname>=<argvalue>. Split on the first "=" only so that a value
  // which itself contains "=" (for example a file path) is not truncated.
  final Map<String, String> args = additionalArgs.stream()
      .map(arg -> arg.split("=", 2))
      .collect(Collectors.toMap(
          pair -> pair[0].trim(),
          // A flag without a value is a boolean. Set true if present.
          pair -> (pair.length > 1 && !pair[1].isEmpty()) ? pair[1].trim() : "true"));

  // Validate required arg.
  if (!args.containsKey("--tinyDb")) {
    throw new IllegalArgumentException("Missing required flag --tinyDb");
  }

  final File tinyDbFile = new File(args.get("--tinyDb"));
  VdiMigrationFileReader reader = new VdiMigrationFileReader(tinyDbFile);

  this.legacyIdToVdiId = reader.readLegacyStudyIdToVdiId();

  // Default to dryrun to avoid incidental migrations when testing.
  this._writeToDb = Boolean.parseBoolean(args.getOrDefault("--liveRun", "false"));
}

/**
* Parse the tinydb file into a map of legacy UD identifiers to VDI identifiers.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,50 +10,61 @@
import org.mockito.Mockito;

import java.io.File;
import java.util.List;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import java.util.Optional;

/**
 * Unit tests for {@link VDIMigrationPlugin#processRecord}. The plugin is wired up through its
 * package-private hooks with a mocked {@link VDIEntityIdRetriever} instead of via
 * {@code configure}, so no database is needed.
 */
public class VDIMigrationPluginTest {
  private WdkModel mockedModel;
  private ClassLoader classLoader;
  private VDIEntityIdRetriever retriever;

  @Before
  public void setup() {
    classLoader = getClass().getClassLoader();
    mockedModel = Mockito.mock(WdkModel.class);
    retriever = Mockito.mock(VDIEntityIdRetriever.class);
  }

  @Test
  public void testUpdateEnabled() throws Exception {
    final TableRowInterfaces.RowResult<AnalysisRow> result = migrateSampleAnalysis();

    Assert.assertEquals("EDAUD_123XyZ", result.getRow().getDatasetId());
    Assert.assertTrue(result.getRow().getDescriptor().toString().contains("VAR_c73e53adb951e2fe"));
    // configure() is never called here, so the plugin stays in its default dry-run mode.
    Assert.assertFalse(result.shouldWrite());
  }

  @Test
  public void testUpdateDisabled() throws Exception {
    final TableRowInterfaces.RowResult<AnalysisRow> result = migrateSampleAnalysis();

    Assert.assertEquals("EDAUD_123XyZ", result.getRow().getDatasetId());
    Assert.assertFalse(result.shouldWrite());
  }

  /**
   * Runs the sample analysis descriptor through a plugin that uses the sample mapping file and
   * the mocked retriever, and returns the result for the legacy dataset {@code EDAUD_1234}.
   */
  private TableRowInterfaces.RowResult<AnalysisRow> migrateSampleAnalysis() throws Exception {
    final JSONObject descriptor = new JSONObject(Files.readString(resourcePath("analysis-unit-test-1.json")));
    final File mappingFile = resourcePath("migration-unit-test-1.json").toFile();

    Mockito.when(retriever.queryEntityId("EDAUD_123XyZ")).thenReturn(Optional.of("asdf"));

    final VDIMigrationPlugin migrationPlugin = new VDIMigrationPlugin();
    migrationPlugin.readVdiMappingFile(mappingFile);
    migrationPlugin.setEntityIdRetriever(retriever);

    return migrationPlugin.processRecord(
        new AnalysisRow("x",
            "EDAUD_1234",
            descriptor,
            3,
            4,
            5));
  }

  /**
   * Resolves a classpath resource to a file system path. Going through the URI rather than
   * {@code URL.getFile()} keeps paths containing spaces or other escaped characters working.
   */
  private Path resourcePath(String resourceName) throws Exception {
    return Path.of(Objects.requireNonNull(classLoader.getResource(resourceName)).toURI());
  }
}
61 changes: 61 additions & 0 deletions Model/src/test/resources/analysis-unit-test-1.json

Large diffs are not rendered by default.

Loading