Skip to content

Commit

Permalink
Merge pull request #280 from JULIELab/taxid_filter
Browse files Browse the repository at this point in the history
Taxid filter
  • Loading branch information
khituras authored Sep 20, 2024
2 parents 59d03f7 + f4c61a7 commit 8db71d9
Show file tree
Hide file tree
Showing 16 changed files with 240 additions and 95 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,17 @@ public class GepiRequestData implements Cloneable {
private boolean includeUnary;
private int eventLikelihood;
private String[] taxId;
private String[] taxIdsA;
private String[] taxIdsB;
private String sectionNameFilterString;
private int pageSize = 10;

public GepiRequestData(List<String> eventTypes, boolean includeUnary, int eventLikelihood, Future<IdConversionResult> listAGePiIds, Future<IdConversionResult> listBGePiIds, String[] taxId, String sentenceFilterString, String paragraphFilterString, String filterFieldsConnectionOperator, String sectionNameFilterString, EnumSet<InputMode> inputMode, String docId, long dataSessionId) {
public GepiRequestData(List<String> eventTypes, boolean includeUnary, int eventLikelihood, Future<IdConversionResult> listAGePiIds, Future<IdConversionResult> listBGePiIds, String[] taxId, String[] taxIdA, String[] taxIdB, String sentenceFilterString, String paragraphFilterString, String filterFieldsConnectionOperator, String sectionNameFilterString, EnumSet<InputMode> inputMode, String docId, long dataSessionId) {
this.includeUnary = includeUnary;
this.eventLikelihood = eventLikelihood;
this.taxId = taxId;
this.taxIdsA = taxIdA;
this.taxIdsB = taxIdB;
this.sectionNameFilterString = sectionNameFilterString;
this.eventTypes = eventTypes;
this.listAGePiIds = listAGePiIds;
Expand Down Expand Up @@ -203,4 +207,12 @@ public String toString() {
", dataSessionId=" + dataSessionId +
'}';
}

/**
 * Returns the tax IDs held for the B-side of this request.
 *
 * @return the {@code taxIdsB} array itself (not a copy); callers in this code base check it for {@code null}
 */
public String[] getTaxIdsB() {
    return this.taxIdsB;
}

/**
 * Returns the tax IDs held for the A-side of this request.
 *
 * @return the {@code taxIdsA} array itself (not a copy); callers in this code base check it for {@code null}
 */
public String[] getTaxIdsA() {
    return this.taxIdsA;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,19 @@ public static BoolQuery getClosedQuery(GepiRequestData requestData, Set<String>
a1b2Clause.addQuery(listA1Query);
a1b2Clause.addQuery(listB2Query);
a1b2Clause.occur = MUST;
// List B can be empty if there is a taxId filter on the B-side. This is then still a form of a closed search.
// However, this results in a B-query that has an empty list of terms which causes
// ElasticSearch to not retrieve anything.
if (listB2Query.terms.isEmpty())
a1b2Clause.queries.remove(listB2Query);

BoolClause a2b1Clause = new BoolClause();
a2b1Clause.addQuery(listA2Query);
a2b1Clause.addQuery(listB1Query);
a2b1Clause.occur = MUST;
// Analogous to the a1b2Clause above: drop the B-query when it has no terms, because a terms
// query with an empty term list keeps ElasticSearch from retrieving anything.
// The removal must target a2b1Clause, the clause listB1Query was added to; removing it from
// a1b2Clause (which never contained it) is a no-op and leaves the empty query in place.
if (listB1Query.terms.isEmpty()) {
    a2b1Clause.queries.remove(listB1Query);
}

BoolQuery a1b2Query = new BoolQuery();
a1b2Query.addClause(a1b2Clause);
Expand Down Expand Up @@ -82,13 +90,22 @@ public static BoolQuery getClosedQuery(GepiRequestData requestData, Set<String>
}
if (requestData.getEventLikelihood() > 1)
addEventLikelihoodFilter(eventQuery, requestData.getEventLikelihood());
if (requestData.getTaxId() != null && requestData.getTaxId().length > 0) {
final TermsQuery taxQuery = new TermsQuery(Arrays.stream(requestData.getTaxId()).collect(Collectors.toList()));
taxQuery.field = FIELD_EVENT_TAX_IDS;
BoolClause taxIdFilterClause = new BoolClause();
taxIdFilterClause.occur = FILTER;
taxIdFilterClause.addQuery(taxQuery);
eventQuery.addClause(taxIdFilterClause);
addTaxIdFilter(eventQuery, requestData.getTaxId(), FIELD_EVENT_TAX_IDS);
if (requestData.getTaxIdsA() != null && requestData.getTaxIdsA().length > 0) {
final TermsQuery taxQuery = new TermsQuery(Arrays.stream(requestData.getTaxIdsA()).collect(Collectors.toList()));
taxQuery.field = FIELD_EVENT_ARG1_TAX_ID;
a1b2Clause.addQuery(taxQuery);
final TermsQuery taxQuery2 = new TermsQuery(Arrays.stream(requestData.getTaxIdsA()).collect(Collectors.toList()));
taxQuery2.field = FIELD_EVENT_ARG2_TAX_ID;
a2b1Clause.addQuery(taxQuery2);
}
if (requestData.getTaxIdsB() != null && requestData.getTaxIdsB().length > 0) {
final TermsQuery taxQuery = new TermsQuery(Arrays.stream(requestData.getTaxIdsB()).collect(Collectors.toList()));
taxQuery.field = FIELD_EVENT_ARG2_TAX_ID;
a1b2Clause.addQuery(taxQuery);
final TermsQuery taxQuery2 = new TermsQuery(Arrays.stream(requestData.getTaxIdsB()).collect(Collectors.toList()));
taxQuery2.field = FIELD_EVENT_ARG1_TAX_ID;
a2b1Clause.addQuery(taxQuery2);
}
if (requestData.getDocId() != null && !requestData.getDocId().isBlank()) {
final MultiMatchQuery docIdQuery = new MultiMatchQuery();
Expand All @@ -102,6 +119,17 @@ public static BoolQuery getClosedQuery(GepiRequestData requestData, Set<String>
return eventQuery;
}

/**
 * Adds a {@code FILTER} clause to {@code eventQuery} that restricts {@code indexSearchField}
 * to the given tax IDs. Does nothing when {@code taxId} is {@code null} or empty.
 *
 * @param eventQuery       the query that receives the new filter clause
 * @param taxId            the tax IDs to filter for; may be {@code null} or empty
 * @param indexSearchField the index field the terms query is issued against
 */
private static void addTaxIdFilter(BoolQuery eventQuery, String[] taxId, String indexSearchField) {
    // Nothing to filter on: leave the query untouched.
    if (taxId == null || taxId.length == 0) {
        return;
    }
    final TermsQuery termsQuery = new TermsQuery(Arrays.stream(taxId).collect(Collectors.toList()));
    termsQuery.field = indexSearchField;

    final BoolClause filterClause = new BoolClause();
    filterClause.addQuery(termsQuery);
    filterClause.occur = FILTER;
    eventQuery.addClause(filterClause);
}

public static BoolQuery getOpenQuery(GepiRequestData requestData) throws InterruptedException, ExecutionException {
List<String> eventTypes = requestData.getEventTypes();
String sentenceFilter = requestData.getSentenceFilterString();
Expand Down Expand Up @@ -156,14 +184,8 @@ public static BoolQuery getOpenQuery(GepiRequestData requestData) throws Interru
}
if (requestData.getEventLikelihood() > 1)
addEventLikelihoodFilter(eventQuery, requestData.getEventLikelihood());
if (requestData.getTaxId() != null && requestData.getTaxId().length > 0) {
final TermsQuery taxQuery = new TermsQuery(Arrays.stream(requestData.getTaxId()).collect(Collectors.toList()));
taxQuery.field = FIELD_EVENT_TAX_IDS;
BoolClause taxIdFilterClause = new BoolClause();
taxIdFilterClause.occur = FILTER;
taxIdFilterClause.addQuery(taxQuery);
eventQuery.addClause(taxIdFilterClause);
}
addTaxIdFilter(eventQuery, requestData.getTaxId(), FIELD_EVENT_TAX_IDS);
addTaxIdFilter(eventQuery, requestData.getTaxIdsA(), FIELD_EVENT_ARG1_TAX_ID);
if (requestData.getDocId() != null && !requestData.getDocId().isBlank()) {
final MultiMatchQuery docIdQuery = new MultiMatchQuery();
docIdQuery.query = requestData.getDocId();
Expand Down Expand Up @@ -197,7 +219,6 @@ public static BoolQuery getFulltextQuery(GepiRequestData requestData) {
final String paragraphFilter = requestData.getParagraphFilterString();
final String sectionNameFilter = requestData.getSectionNameFilterString();
final int eventLikelihood = requestData.getEventLikelihood();
final String[] taxIds = requestData.getTaxId();
final String docId = requestData.getDocId();

if (eventTypes != null && !eventTypes.isEmpty()) {
Expand Down Expand Up @@ -236,14 +257,9 @@ public static BoolQuery getFulltextQuery(GepiRequestData requestData) {
if (eventLikelihood > 1) {
addEventLikelihoodFilter(eventQuery, eventLikelihood);
}
if (taxIds != null && taxIds.length > 0) {
final TermsQuery taxQuery = new TermsQuery(Arrays.stream(taxIds).collect(Collectors.toList()));
taxQuery.field = FIELD_EVENT_TAX_IDS;
BoolClause taxIdFilterClause = new BoolClause();
taxIdFilterClause.occur = FILTER;
taxIdFilterClause.addQuery(taxQuery);
eventQuery.addClause(taxIdFilterClause);
}
addTaxIdFilter(eventQuery, requestData.getTaxId(), FIELD_EVENT_TAX_IDS);
addTaxIdFilter(eventQuery, requestData.getTaxIdsA(), FIELD_EVENT_ARG1_TAX_ID);
addTaxIdFilter(eventQuery, requestData.getTaxIdsB(), FIELD_EVENT_ARG2_TAX_ID);
if (docId != null && docId.isBlank()) {
final MultiMatchQuery docIdQuery = new MultiMatchQuery();
docIdQuery.query = docId;
Expand All @@ -269,7 +285,7 @@ private static void addFulltextSearchQuery(String filterQuery, String field, Boo
final SimpleQueryStringQuery textFilterQuery = new SimpleQueryStringQuery();
textFilterQuery.flags = List.of(SimpleQueryStringQuery.Flag.ALL);
textFilterQuery.query = filterQuery;
textFilterQuery.fields = Arrays.asList(field);
textFilterQuery.fields = List.of(field);
final BoolClause textFilterClause = new BoolClause();
textFilterClause.addQuery(textFilterQuery);
textFilterClause.occur = occur;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ public class EventRetrievalService implements IEventRetrievalService {

public static final String FIELD_EVENT_ARG1_GENE_ID = "argument1geneid";

public static final String FIELD_EVENT_ARG1_TAX_ID = "argument1taxid";

public static final String FIELD_EVENT_ARG1_CONCEPT_ID = "argument1conceptid";

public static final String FIELD_EVENT_ARG1_TOP_HOMOLOGY_ID = "argument1tophomoid";
Expand All @@ -71,6 +73,8 @@ public class EventRetrievalService implements IEventRetrievalService {

public static final String FIELD_EVENT_ARG2_GENE_ID = "argument2geneid";

public static final String FIELD_EVENT_ARG2_TAX_ID = "argument2taxid";

public static final String FIELD_EVENT_ARG2_CONCEPT_ID = "argument2conceptid";

public static final String FIELD_EVENT_ARG2_TOP_HOMOLOGY_ID = "argument2tophomoid";
Expand Down Expand Up @@ -211,10 +215,10 @@ public Future<EventRetrievalResult> closedSearch(GepiRequestData requestData, in
return CompletableFuture.supplyAsync(() -> {
try {

log.debug("Retrieving closed events for {} A IDs and {} B IDs", requestData.getListAGePiIds().get().getConvertedItems().size(), requestData.getListBGePiIds().get().getConvertedItems().size());
log.debug("Retrieving closed events for {} A IDs and {} B IDs", requestData.getListAGePiIds().get().getConvertedItems().size(), requestData.getListBGePiIds() != null ? requestData.getListBGePiIds().get().getConvertedItems().size() : 0);
if (log.isDebugEnabled())
log.debug("Some A target IDs are: {}", requestData.getListAGePiIds().get().getTargetIds().stream().limit(10).collect(Collectors.joining(", ")));
if (log.isDebugEnabled())
if (requestData.getListBGePiIds() != null && log.isDebugEnabled())
log.debug("Some B target IDs are: {}", requestData.getListBGePiIds().get().getTargetIds().stream().limit(10).collect(Collectors.joining(", ")));

SearchServerRequest serverRqst = getClosedSearchRequest(requestData, from, numRows, forCharts);
Expand Down Expand Up @@ -245,7 +249,8 @@ public Future<EventRetrievalResult> closedSearch(GepiRequestData requestData, in
}

private SearchServerRequest getClosedSearchRequest(GepiRequestData requestData, int from, int numRows, boolean forCharts) throws ExecutionException, InterruptedException {
BoolQuery eventQuery = EventQueries.getClosedQuery(requestData, requestData.getAListIdsAsSet(), requestData.getBListIdsAsSet());
// List B may be absent: a closed search is also valid when there is only a tax ID filter on the B-side.
BoolQuery eventQuery = EventQueries.getClosedQuery(requestData, requestData.getAListIdsAsSet(), requestData.getListBGePiIds() != null ? requestData.getBListIdsAsSet() : Collections.emptySet());

boolean downloadAll = forCharts || numRows == Integer.MAX_VALUE;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.util.concurrent.Future;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static java.nio.charset.StandardCharsets.UTF_8;

Expand Down Expand Up @@ -85,7 +86,6 @@ public long newSession() {
return id;
}

@Log
@Override
public GePiData getData(long sessionId) {
GePiData data = dataCache.getIfPresent(sessionId);
Expand Down Expand Up @@ -323,7 +323,8 @@ public JSONArray convertToJson(List<Event> eventList) {
}

@Override
public Path getOverviewExcel(Future<EventRetrievalResult> eventRetrievalResult, long dataSessionId, EnumSet<InputMode> inputMode, String sentenceFilterString, String paragraphFilterString, String sectionNameFilterString) throws IOException, ExecutionException, InterruptedException {
public Path getOverviewExcel(Future<EventRetrievalResult> eventRetrievalResult, GepiRequestData requestData) throws IOException, ExecutionException, InterruptedException {
final long dataSessionId = requestData.getDataSessionId();
long time = System.currentTimeMillis();
log.info("Creating event statistics Excel file for dataSessionId {}", dataSessionId);
final Path tempStatusFile = getTempStatusFile(dataSessionId);
Expand All @@ -337,7 +338,7 @@ public Path getOverviewExcel(Future<EventRetrievalResult> eventRetrievalResult,
Path xlsFile = getTempXlsDataFile(dataSessionId);
writeOverviewTsvFile(eventRetrievalResult.get().getEventList(), tsvFile);
updateDownloadFileCreationsStatus("Step 2 of 3: Retrieval of all interactions has finished. Creating Excel file.", dataSessionId);
createExcelSummaryFile(tsvFile, xlsFile, inputMode, sentenceFilterString, paragraphFilterString, sectionNameFilterString);
createExcelSummaryFile(tsvFile, xlsFile, requestData);
updateDownloadFileCreationsStatus(EXCEL_FILE_SUCCESS_STATE + " The file is ready for download.", dataSessionId);
time = System.currentTimeMillis() - time;
log.info("Excel sheet creation took {} seconds", time / 1000);
Expand Down Expand Up @@ -367,8 +368,33 @@ private void updateDownloadFileCreationsStatus(String status, long dataSessionId
Files.writeString(tempStatusFile, status);
}

private void createExcelSummaryFile(Path tsvFile, Path xlsFile, EnumSet<InputMode> inputMode, String sentenceFilterString, String paragraphFilterString, String sectionNameFilterString) throws IOException {
ProcessBuilder builder = new ProcessBuilder().command("python3", "-c", excelResultCreationScript, tsvFile.toAbsolutePath().toString(), xlsFile.toAbsolutePath().toString(), inputMode.stream().map(InputMode::name).collect(Collectors.joining(" ")), sentenceFilterString != null ? sentenceFilterString : "<none>", paragraphFilterString != null ? paragraphFilterString : "<none>", sectionNameFilterString != null ? sectionNameFilterString : "<none>");
private void createExcelSummaryFile(Path tsvFile, Path xlsFile, GepiRequestData requestData) throws IOException {
final EnumSet<InputMode> inputMode = requestData.getInputMode();
final String sentenceFilterString = requestData.getSentenceFilterString();
final String paragraphFilterString = requestData.getParagraphFilterString();
final String sectionNameFilterString = requestData.getSectionNameFilterString();
final List<String> eventTypes = requestData.getEventTypes();
final boolean includeUnary = requestData.isIncludeUnary();
final String[] taxId = requestData.getTaxId();
final String[] taxIdsA = requestData.getTaxIdsA();
final String[] taxIdsB = requestData.getTaxIdsB();
final int eventLikelihood = requestData.getEventLikelihood();
Map<Integer, String> likelihood2string = Map.of(1, "negation", 2, "low", 3, "investigation", 4, "moderate", 5, "high", 6, "assertion");

// Every command element must be non-null, otherwise ProcessBuilder.start() throws a
// NullPointerException. Absent values are therefore passed as "<none>", and a likelihood
// level without a name in the map is passed as its plain number.
// NOTE(review): confirm the Python script tolerates these fallback values.
ProcessBuilder builder = new ProcessBuilder().command("python3", "-c",
        excelResultCreationScript,
        tsvFile.toAbsolutePath().toString(),
        xlsFile.toAbsolutePath().toString(),
        inputMode.stream().map(InputMode::name).collect(Collectors.joining(" ")),
        sentenceFilterString != null ? sentenceFilterString : "<none>",
        paragraphFilterString != null ? paragraphFilterString : "<none>",
        sectionNameFilterString != null ? sectionNameFilterString : "<none>",
        // The query builders treat a null event type list as valid, so do the same here.
        eventTypes != null ? String.join(" ", eventTypes) : "<none>",
        Boolean.toString(includeUnary),
        likelihood2string.getOrDefault(eventLikelihood, String.valueOf(eventLikelihood)),
        taxId != null ? String.join(" ", taxId) : "<none>",
        taxIdsA != null ? String.join(" ", taxIdsA) : "<none>",
        taxIdsB != null ? String.join(" ", taxIdsB) : "<none>"
);
log.info("xls builder command: {}", builder.command());
Process process = builder.start();
InputStream processInput = process.getInputStream();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ public Future<IdConversionResult> convert(Stream<String> stream, IdType to) {
return CompletableFuture.supplyAsync(() -> {
long time = System.currentTimeMillis();
final Multimap<IdType, String> idsByType = determineIdTypes(stream);
// Supply the number of input IDs so that the "{}" placeholder is actually filled in.
log.debug("Starting to convert {} input IDs for GePI retrieval.", idsByType.size());
final List<Future<IdConversionResult>> convertedIds = new ArrayList<>();
for (IdType from : idsByType.keySet()) {
final Collection<String> sourceIds = idsByType.get(from);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import java.io.IOException;
import java.lang.ref.WeakReference;
import java.nio.file.Path;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
Expand Down Expand Up @@ -75,13 +74,10 @@ public interface IGePiDataService {
* <p>To do this, the event data is written to a temporary file, a Python-Pandas script is applied and the
* resulting Excel file is then read back in the form of the InputStream.</p>
* @param events The events to create the result workbook for.
* @param inputMode
* @param sentenceFilterString
* @param paragraphFilterString
* @param sectionNameFilterString
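* @param requestData The request whose data session ID, input mode and filter settings are used to create the Excel file.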
* @return An InputStream of the created Excel file.
*/
Path getOverviewExcel(Future<EventRetrievalResult> events, long dataSessionId, EnumSet<InputMode> inputMode, String sentenceFilterString, String paragraphFilterString, String sectionNameFilterString) throws IOException, ExecutionException, InterruptedException;
Path getOverviewExcel(Future<EventRetrievalResult> events, GepiRequestData requestData) throws IOException, ExecutionException, InterruptedException;

String getDownloadFileCreationStatus(long dataSessionId) throws IOException;

Expand Down
Loading

0 comments on commit 8db71d9

Please sign in to comment.