diff --git a/doc/release-notes/10379-MetricsBugsFixes.md b/doc/release-notes/10379-MetricsBugsFixes.md new file mode 100644 index 00000000000..0ebc6d99f0b --- /dev/null +++ b/doc/release-notes/10379-MetricsBugsFixes.md @@ -0,0 +1,10 @@ + +### Metrics API Bug Fixes + +Two bugs in the Metrics API have been fixed: + +- The /datasets and /datasets/byMonth endpoints could report incorrect values when called with the dataLocation parameter (which allows getting metrics for local, remote (harvested), or all datasets), because the metrics cache was not storing separate values for each of these cases. + +- Metrics endpoints whose calculation relied on finding the latest published dataset version were incorrect when the minor version number was > 9. + +When deploying the new release, the [/api/admin/clearMetricsCache](https://guides.dataverse.org/en/latest/api/native-api.html#metrics) API should be called to remove old cached values that may be incorrect. \ No newline at end of file diff --git a/doc/release-notes/10708 - MDC Citation and DOI parsing improvements.md b/doc/release-notes/10708 - MDC Citation and DOI parsing improvements.md new file mode 100644 index 00000000000..1dcd293df77 --- /dev/null +++ b/doc/release-notes/10708 - MDC Citation and DOI parsing improvements.md @@ -0,0 +1,3 @@ +MDC Citation retrieval with the PID settings has been fixed. +DOI parsing in Dataverse is case-insensitive, improving interaction with services that may change the case. +Warnings related to managed/excluded PID lists for PID providers have been reduced. diff --git a/doc/release-notes/10886-update-to-conditions-to-display-image_url.md b/doc/release-notes/10886-update-to-conditions-to-display-image_url.md new file mode 100644 index 00000000000..6dfe8eb9f2d --- /dev/null +++ b/doc/release-notes/10886-update-to-conditions-to-display-image_url.md @@ -0,0 +1,8 @@ +Search API (/api/search) responses for DataFiles include image_url for the thumbnail if all of the following are true: +1. The DataFile is not Harvested +2. A Thumbnail is available for the DataFile +3. If the DataFile is Restricted, then the caller must have Download File Permission for the DataFile +4. The DataFile is NOT actively embargoed +5. The DataFile's retention period has NOT expired + +See also #10875 and #10886. diff --git a/doc/sphinx-guides/source/style/text.rst b/doc/sphinx-guides/source/style/text.rst index 4fb2352300c..10fbd08da4a 100644 --- a/doc/sphinx-guides/source/style/text.rst +++ b/doc/sphinx-guides/source/style/text.rst @@ -9,4 +9,4 @@ Here we describe the guidelines that help us provide helpful, clear and consiste Metadata Text Guidelines ======================== -These guidelines are maintained in `a Google Doc `__ as we expect to make frequent changes to them. We welcome comments in the Google Doc. \ No newline at end of file +These guidelines are maintained in `a Google Doc `__ as we expect to make frequent changes to them. We welcome comments in the Google Doc.
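Aside on the deployment step in the Metrics note above: a minimal, self-contained sketch of the cache-clearing call using only `java.net.http`. The base URL is a placeholder for your own installation, and the DELETE verb follows the Metrics section of the linked Native API guide:

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class ClearMetricsCache {
    public static void main(String[] args) throws Exception {
        // Placeholder base URL -- substitute your own installation's address.
        String baseUrl = "http://localhost:8080";
        HttpClient client = HttpClient.newHttpClient();
        // DELETE /api/admin/clearMetricsCache drops all cached metrics values,
        // including any that were stored under the wrong key before this fix.
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(baseUrl + "/api/admin/clearMetricsCache"))
                .DELETE()
                .build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode() + " " + response.body());
    }
}
```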
diff --git a/pom.xml b/pom.xml index 1e70ff68480..b2344989569 100644 --- a/pom.xml +++ b/pom.xml @@ -72,7 +72,7 @@ org.apache.james apache-mime4j-core - 0.8.7 + 0.8.10 org.apache.james diff --git a/src/main/java/edu/harvard/iq/dataverse/ThumbnailServiceWrapper.java b/src/main/java/edu/harvard/iq/dataverse/ThumbnailServiceWrapper.java index 542cf39cfbe..46736da73d4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ThumbnailServiceWrapper.java +++ b/src/main/java/edu/harvard/iq/dataverse/ThumbnailServiceWrapper.java @@ -5,6 +5,7 @@ */ package edu.harvard.iq.dataverse; +import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.dataaccess.DataAccess; import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter; import edu.harvard.iq.dataverse.dataaccess.StorageIO; @@ -20,7 +21,6 @@ import jakarta.ejb.EJB; import jakarta.enterprise.context.RequestScoped; -import jakarta.inject.Inject; import jakarta.inject.Named; /** @@ -33,9 +33,8 @@ public class ThumbnailServiceWrapper implements java.io.Serializable { private static final Logger logger = Logger.getLogger(ThumbnailServiceWrapper.class.getCanonicalName()); - - @Inject - PermissionsWrapper permissionsWrapper; + @EJB + PermissionServiceBean permissionService; @EJB DataverseServiceBean dataverseService; @EJB @@ -49,12 +48,15 @@ public class ThumbnailServiceWrapper implements java.io.Serializable { private Map dvobjectViewMap = new HashMap<>(); private Map hasThumbMap = new HashMap<>(); + private boolean hasDownloadFilePermission(DvObject dvo) { + return permissionService.on(dvo).has(Permission.DownloadFile); + } public String getFileCardImageAsUrl(SolrSearchResult result) { DataFile dataFile = result != null && result.getEntity() != null ? ((DataFile) result.getEntity()) : null; - if (dataFile == null || result.isHarvested() + if (dataFile == null + || result.isHarvested() || !isThumbnailAvailable(dataFile) - || dataFile.isRestricted() - || !dataFile.isReleased() + || (dataFile.isRestricted() && !hasDownloadFilePermission(dataFile)) || FileUtil.isActivelyEmbargoed(dataFile) || FileUtil.isRetentionExpired(dataFile)) { return null; @@ -105,7 +107,7 @@ public String getFileCardImageAsBase64Url(SolrSearchResult result) { } if ((!((DataFile)result.getEntity()).isRestricted() - || permissionsWrapper.hasDownloadFilePermission(result.getEntity())) + || hasDownloadFilePermission(result.getEntity())) && isThumbnailAvailable((DataFile) result.getEntity())) { cardImageUrl = ImageThumbConverter.getImageThumbnailAsBase64( diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Dataverses.java b/src/main/java/edu/harvard/iq/dataverse/api/Dataverses.java index 0ee146ed99b..2be6b1e51c2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Dataverses.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Dataverses.java @@ -621,62 +621,22 @@ public Response deleteDataverse(@Context ContainerRequestContext crc, @PathParam public Response updateAttribute(@Context ContainerRequestContext crc, @PathParam("identifier") String identifier, @PathParam("attribute") String attribute, @QueryParam("value") String value) { try { - Dataverse collection = findDataverseOrDie(identifier); - User user = getRequestUser(crc); - DataverseRequest dvRequest = createDataverseRequest(user); - - // TODO: The cases below use hard coded strings, because we have no place for definitions of those! - // They are taken from util.json.JsonParser / util.json.JsonPrinter. This shall be changed.
- // This also should be extended to more attributes, like the type, theme, contacts, some booleans, etc. - switch (attribute) { - case "alias": - collection.setAlias(value); - break; - case "name": - collection.setName(value); - break; - case "description": - collection.setDescription(value); - break; - case "affiliation": - collection.setAffiliation(value); - break; - /* commenting out the code from the draft pr #9462: - case "versionPidsConduct": - CollectionConduct conduct = CollectionConduct.findBy(value); - if (conduct == null) { - return badRequest("'" + value + "' is not one of [" + - String.join(",", CollectionConduct.asList()) + "]"); - } - collection.setDatasetVersionPidConduct(conduct); - break; - */ - case "filePIDsEnabled": - if(!user.isSuperuser()) { - return forbidden("You must be a superuser to change this setting"); - } - if(!settingsService.isTrueForKey(SettingsServiceBean.Key.AllowEnablingFilePIDsPerCollection, false)) { - return forbidden("Changing File PID policy per collection is not enabled on this server"); - } - collection.setFilePIDsEnabled(parseBooleanOrDie(value)); - break; - default: - return badRequest("'" + attribute + "' is not a supported attribute"); - } - - // Off to persistence layer - execCommand(new UpdateDataverseCommand(collection, null, null, dvRequest, null)); - - // Also return modified collection to user - return ok("Update successful", JsonPrinter.json(collection)); - - // TODO: This is an anti-pattern, necessary due to this bean being an EJB, causing very noisy and unnecessary - // logging by the EJB container for bubbling exceptions. (It would be handled by the error handlers.) + Dataverse dataverse = findDataverseOrDie(identifier); + Object formattedValue = formatAttributeValue(attribute, value); + dataverse = execCommand(new UpdateDataverseAttributeCommand(createDataverseRequest(getRequestUser(crc)), dataverse, attribute, formattedValue)); + return ok("Update successful", JsonPrinter.json(dataverse)); } catch (WrappedResponse e) { return e.getResponse(); } } + private Object formatAttributeValue(String attribute, String value) throws WrappedResponse { + if (attribute.equals("filePIDsEnabled")) { + return parseBooleanOrDie(value); + } + return value; + } + @GET @AuthRequired @Path("{identifier}/inputLevels") diff --git a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java index 1f2f1039327..306b863c9e4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java @@ -19,6 +19,9 @@ import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.Iterator; import java.util.List; @@ -152,10 +155,17 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws IOE // DataCite wants "doi=", not "doi:". 
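Aside: the MakeDataCountApi hunk just below replaces the deprecated `new URL(String)` constructor with `new URI(...).toURL()`, so a malformed DATACITE_REST_API_URL value surfaces as a catchable `URISyntaxException`. A minimal standalone sketch of the same pattern (the class name and example URLs are illustrative only, not from the PR):

```java
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;

public class UrlParsingDemo {
    // Build a URI first so that a syntactically invalid value (e.g., a bad
    // DATACITE_REST_API_URL setting) surfaces as a catchable exception
    // instead of going through the deprecated URL(String) constructor.
    static URL toUrlOrNull(String spec) {
        try {
            return new URI(spec).toURL();
        } catch (URISyntaxException | MalformedURLException e) {
            System.err.println("Bad URL spec '" + spec + "': " + e.getMessage());
            return null;
        }
    }

    public static void main(String[] args) {
        System.out.println(toUrlOrNull("https://api.datacite.org/events?doi=10.70122/example&source=crossref"));
        System.out.println(toUrlOrNull("https://api.datacite.org/events?doi=10.70122/exa mple")); // space -> URISyntaxException
    }
}
```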
String authorityPlusIdentifier = persistentId.replaceFirst("doi:", ""); // Request max page size and then loop to handle multiple pages - URL url = new URL(JvmSettings.DATACITE_REST_API_URL.lookup() + + URL url = null; + try { + url = new URI(JvmSettings.DATACITE_REST_API_URL.lookup(pidProvider.getId()) + "/events?doi=" + authorityPlusIdentifier + - "&source=crossref&page[size]=1000"); + "&source=crossref&page[size]=1000").toURL(); + } catch (URISyntaxException e) { + // Nominally this means a config error / bad DATACITE_REST_API_URL for this provider + logger.warning("Unable to create URL for " + persistentId + ", pidProvider " + pidProvider.getId()); + return error(Status.INTERNAL_SERVER_ERROR, "Unable to create DataCite URL to retrieve citations."); + } logger.fine("Retrieving Citations from " + url.toString()); boolean nextPage = true; JsonArrayBuilder dataBuilder = Json.createArrayBuilder(); @@ -178,7 +188,12 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws IOE dataBuilder.add(iter.next()); } if (links.containsKey("next")) { - url = new URL(links.getString("next")); + try { + url = new URI(links.getString("next")).toURL(); + } catch (URISyntaxException e) { + logger.warning("Unable to create URL from DataCite response: " + links.getString("next")); + return error(Status.INTERNAL_SERVER_ERROR, "Unable to retrieve all results from DataCite"); + } } else { nextPage = false; } @@ -187,7 +202,7 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws IOE JsonArray allData = dataBuilder.build(); List datasetExternalCitations = datasetExternalCitationsService.parseCitations(allData); /* - * ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopuate them. + * ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopulate them. * As is, this call doesn't remove old citations if there are now none (legacy issue if we decide to stop counting certain types of citation * as we've done for 'hasPart'). * If there are some, this call individually checks each one and if a matching item exists, it removes it and adds it back.
Faster and better to delete all and diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Metrics.java b/src/main/java/edu/harvard/iq/dataverse/api/Metrics.java index 452e5df9f9a..f36c514859e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Metrics.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Metrics.java @@ -206,12 +206,13 @@ public Response getDatasetsTimeSeriest(@Context Request req, @Context UriInfo ur return error(BAD_REQUEST, ia.getLocalizedMessage()); } String metricName = "datasets"; - JsonArray jsonArray = MetricsUtil.stringToJsonArray(metricsSvc.returnUnexpiredCacheAllTime(metricName, null, d)); + String validDataLocation = MetricsUtil.validateDataLocationStringType(dataLocation); + JsonArray jsonArray = MetricsUtil.stringToJsonArray(metricsSvc.returnUnexpiredCacheAllTime(metricName, validDataLocation, d)); if (null == jsonArray) { // run query and save - jsonArray = metricsSvc.getDatasetsTimeSeries(uriInfo, dataLocation, d); - metricsSvc.save(new Metric(metricName, null, null, d, jsonArray.toString())); + jsonArray = metricsSvc.getDatasetsTimeSeries(uriInfo, validDataLocation, d); + metricsSvc.save(new Metric(metricName, null, validDataLocation, d, jsonArray.toString())); } MediaType requestedType = getVariant(req, MediaType.valueOf(FileUtil.MIME_TYPE_CSV), MediaType.APPLICATION_JSON_TYPE); if ((requestedType != null) && (requestedType.equals(MediaType.APPLICATION_JSON_TYPE))) { diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index d2bba56f884..66f48bfb872 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -7,7 +7,6 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; import edu.harvard.iq.dataverse.DatasetFieldConstant; @@ -20,6 +19,7 @@ import edu.harvard.iq.dataverse.DataverseContact; import edu.harvard.iq.dataverse.DataverseServiceBean; import edu.harvard.iq.dataverse.EjbDataverseEngine; +import edu.harvard.iq.dataverse.GlobalId; import edu.harvard.iq.dataverse.MetadataBlockServiceBean; import edu.harvard.iq.dataverse.api.dto.DatasetDTO; import edu.harvard.iq.dataverse.api.imports.ImportUtil.ImportType; @@ -31,6 +31,7 @@ import edu.harvard.iq.dataverse.engine.command.impl.CreateHarvestedDatasetCommand; import edu.harvard.iq.dataverse.engine.command.impl.CreateNewDatasetCommand; import edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand; +import edu.harvard.iq.dataverse.engine.command.impl.UpdateHarvestedDatasetCommand; import edu.harvard.iq.dataverse.harvest.client.HarvestingClient; import edu.harvard.iq.dataverse.search.IndexServiceBean; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; @@ -40,6 +41,7 @@ import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.license.LicenseServiceBean; import edu.harvard.iq.dataverse.pidproviders.PidUtil; +import edu.harvard.iq.dataverse.util.DatasetFieldUtil; import java.io.File; import java.io.FileOutputStream; @@ -208,7 +210,7 @@ public JsonObjectBuilder handleFile(DataverseRequest dataverseRequest, Dataverse @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, String 
harvestIdentifier, String metadataFormat, File metadataFile, Date oaiDateStamp, PrintWriter cleanupLog) throws ImportException, IOException { if (harvestingClient == null || harvestingClient.getDataverse() == null) { - throw new ImportException("importHarvestedDataset called wiht a null harvestingClient, or an invalid harvestingClient."); + throw new ImportException("importHarvestedDataset called with a null harvestingClient, or an invalid harvestingClient."); } Dataverse owner = harvestingClient.getDataverse(); Dataset importedDataset = null; @@ -268,116 +270,124 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve } JsonObject obj = JsonUtil.getJsonObject(json); - //and call parse Json to read it into a dataset + + String protocol = obj.getString("protocol", null); + String authority = obj.getString("authority", null); + String identifier = obj.getString("identifier",null); + + GlobalId globalId; + + // A Global ID is required: + // (meaning, we will fail with an exception if the imports above have + // not managed to find an acceptable global identifier in the harvested + // metadata) + + try { + globalId = PidUtil.parseAsGlobalID(protocol, authority, identifier); + } catch (IllegalArgumentException iax) { + throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse can parse, skipping."); + } + + if (globalId == null) { + throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping."); + } + + String globalIdString = globalId.asString(); + + if (StringUtils.isEmpty(globalIdString)) { + // @todo this check may not be necessary, now that there's a null check above + throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global identifier this Dataverse recognizes, skipping."); + } + + DatasetVersion harvestedVersion; + + Dataset existingDataset = datasetService.findByGlobalId(globalIdString); + try { + Dataset harvestedDataset; + JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService, licenseService, datasetTypeService, harvestingClient); parser.setLenient(true); - Dataset ds = parser.parseDataset(obj); - - // For ImportType.NEW, if the metadata contains a global identifier, and it's not a protocol - // we support, it should be rejected. - // (TODO: ! - add some way of keeping track of supported protocols!) 
- //if (ds.getGlobalId() != null && !ds.getProtocol().equals(settingsService.getValueForKey(SettingsServiceBean.Key.Protocol, ""))) { - // throw new ImportException("Could not register id " + ds.getGlobalId() + ", protocol not supported"); - //} - ds.setOwner(owner); - ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields()); - if (ds.getVersions().get(0).getReleaseTime() == null) { - ds.getVersions().get(0).setReleaseTime(oaiDateStamp); - } - - // Check data against required contraints - List> violations = ds.getVersions().get(0).validateRequired(); - if (!violations.isEmpty()) { - // For migration and harvest, add NA for missing required values - for (ConstraintViolation v : violations) { - DatasetField f = v.getRootBean(); - f.setSingleValue(DatasetField.NA_VALUE); + if (existingDataset == null) { + // Creating a new dataset from scratch: + + harvestedDataset = parser.parseDataset(obj); + + harvestedDataset.setHarvestedFrom(harvestingClient); + harvestedDataset.setHarvestIdentifier(harvestIdentifier); + + harvestedVersion = harvestedDataset.getVersions().get(0); + } else { + // We already have a dataset with this id in the database. + // Let's check a few things before we go any further with it: + + // If this dataset already exists IN ANOTHER COLLECTION + // we are just going to skip it! + if (existingDataset.getOwner() != null && !owner.getId().equals(existingDataset.getOwner().getId())) { + throw new ImportException("The dataset with the global id " + globalIdString + " already exists, in the dataverse " + existingDataset.getOwner().getAlias() + ", skipping."); } - } - - // Check data against validation constraints - // If we are migrating and "scrub migration data" is true we attempt to fix invalid data - // if the fix fails stop processing of this file by throwing exception - Set invalidViolations = ds.getVersions().get(0).validate(); - ValidatorFactory factory = Validation.buildDefaultValidatorFactory(); - Validator validator = factory.getValidator(); - if (!invalidViolations.isEmpty()) { - for (ConstraintViolation v : invalidViolations) { - DatasetFieldValue f = v.getRootBean(); - boolean fixed = false; - boolean converted = false; - // TODO: Is this scrubbing something we want to continue doing? - if (settingsService.isTrueForKey(SettingsServiceBean.Key.ScrubMigrationData, false)) { - fixed = processMigrationValidationError(f, cleanupLog, metadataFile.getName()); - converted = true; - if (fixed) { - Set> scrubbedViolations = validator.validate(f); - if (!scrubbedViolations.isEmpty()) { - fixed = false; - } - } - } - if (!fixed) { - String msg = "Data modified - File: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " - + "Invalid value: '" + f.getValue() + "'" + " Converted Value:'" + DatasetField.NA_VALUE + "'"; - cleanupLog.println(msg); - f.setValue(DatasetField.NA_VALUE); - - } + // And if we already have a dataset with this same global id at + // this Dataverse instance, but it is a LOCAL dataset (can happen!), + // we're going to skip it also: + if (!existingDataset.isHarvested()) { + throw new ImportException("A LOCAL dataset with the global id " + globalIdString + " already exists in this dataverse; skipping."); + } + // For harvested datasets, there should always only be one version. 
+ if (existingDataset.getVersions().size() != 1) { + throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDataset.getVersions().size() + " versions"); } + + // We will attempt to import the new version, and replace the + // current, already existing version with it. + harvestedVersion = parser.parseDatasetVersion(obj.getJsonObject("datasetVersion")); + + // For the purposes of validation, the version needs to be attached + // to a non-null dataset. We will create a throwaway temporary + // dataset for this: + harvestedDataset = createTemporaryHarvestedDataset(harvestedVersion); } + + harvestedDataset.setOwner(owner); - // A Global ID is required, in order for us to be able to harvest and import - // this dataset: - if (StringUtils.isEmpty(ds.getGlobalId().asString())) { - throw new ImportException("The harvested metadata record with the OAI server identifier "+harvestIdentifier+" does not contain a global unique identifier that we could recognize, skipping."); - } - - ds.setHarvestedFrom(harvestingClient); - ds.setHarvestIdentifier(harvestIdentifier); + // Either a full new import, or an update of an existing harvested + // Dataset, perform some cleanup on the new version imported from the + // parsed metadata: - Dataset existingDs = datasetService.findByGlobalId(ds.getGlobalId().asString()); + harvestedVersion.setDatasetFields(harvestedVersion.initDatasetFields()); - if (existingDs != null) { - // If this dataset already exists IN ANOTHER DATAVERSE - // we are just going to skip it! - if (existingDs.getOwner() != null && !owner.getId().equals(existingDs.getOwner().getId())) { - throw new ImportException("The dataset with the global id "+ds.getGlobalId().asString()+" already exists, in the dataverse "+existingDs.getOwner().getAlias()+", skipping."); - } - // And if we already have a dataset with this same id, in this same - // dataverse, but it is LOCAL dataset (can happen!), we're going to - // skip it also: - if (!existingDs.isHarvested()) { - throw new ImportException("A LOCAL dataset with the global id "+ds.getGlobalId().asString()+" already exists in this dataverse; skipping."); - } - // For harvested datasets, there should always only be one version. - // We will replace the current version with the imported version. - if (existingDs.getVersions().size() != 1) { - throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDs.getVersions().size() + " versions"); - } - // Purge all the SOLR documents associated with this client from the - // index server: - indexService.deleteHarvestedDocuments(existingDs); - // files from harvested datasets are removed unceremoniously, - // directly in the database. no need to bother calling the - // DeleteFileCommand on them. - for (DataFile harvestedFile : existingDs.getFiles()) { - DataFile merged = em.merge(harvestedFile); - em.remove(merged); - harvestedFile = null; - } - // TODO: - // Verify what happens with the indexed files in SOLR? - // are they going to be overwritten by the reindexing of the dataset? - existingDs.setFiles(null); - Dataset merged = em.merge(existingDs); - // harvested datasets don't have physical files - so no need to worry about that. - engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest)); + if (harvestedVersion.getReleaseTime() == null) { + harvestedVersion.setReleaseTime(oaiDateStamp); + } + + // is this the right place to call tidyUpFields()? 
+ // usually it is called within the body of the create/update commands + // later on. + DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true); + + // Check data against validation constraints. + // Make an attempt to sanitize any invalid fields encountered - + // missing required fields or invalid values, by filling the values + // with NAs. + + boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog); + + // Note: this sanitizing approach, of replacing invalid values with + // "NA" does not work with certain fields. For example, using it to + // populate a GeoBox coordinate value will result in an invalid + // field. So we will attempt to re-validate the sanitized version. + // This time around, it will throw an exception if still invalid, so + // that we'll stop before proceeding any further: + + if (sanitized) { + validateVersionMetadata(harvestedVersion, cleanupLog); } - importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(ds, dataverseRequest)); + if (existingDataset != null) { + importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest)); + } else { + importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest)); + } } catch (JsonParseException | ImportException | CommandException ex) { logger.fine("Failed to import harvested dataset: " + ex.getClass() + ": " + ex.getMessage()); @@ -439,7 +449,7 @@ public JsonObjectBuilder doImport(DataverseRequest dataverseRequest, Dataverse o ds.setOwner(owner); ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields()); - // Check data against required contraints + // Check data against required constraints List> violations = ds.getVersions().get(0).validateRequired(); if (!violations.isEmpty()) { if ( importType.equals(ImportType.HARVEST) ) { @@ -696,6 +706,84 @@ private String convertInvalidDateString(String inString){ return null; } + /** + * A shortcut method for validating AND attempting to sanitize a DatasetVersion + * @param version + * @param cleanupLog - any invalid values and their replacements are logged there + * @return true if any invalid values were encountered and sanitized + * @throws ImportException (although it should never happen in this mode) + */ + private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException { + return validateVersionMetadata(version, true, cleanupLog); + } + + /** + * A shortcut method for validating a DatasetVersion; will throw an exception + * if invalid, without attempting to sanitize the invalid values. + * @param version + * @param log - will log the invalid fields encountered there + * @throws ImportException + */ + private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException { + validateVersionMetadata(version, false, log); + } + + /** + * Validate the metadata fields of a newly-created version; depending on + * the "sanitize" flag supplied, this method may or may not attempt to sanitize + * the supplied values by replacing them with "NA"s.
+ * @param version + * @param sanitize - boolean indicating whether to attempt to fix invalid values + * @param cleanupLog - any invalid values encountered will be logged there + * @return - true if any invalid values have been replaced + * @throws ImportException + */ + private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException { + boolean fixed = false; + Set invalidViolations = version.validate(); + if (!invalidViolations.isEmpty()) { + for (ConstraintViolation v : invalidViolations) { + DatasetFieldValue f = v.getRootBean(); + + String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + + "Invalid value: '" + f.getValue() + "'"; + if (sanitize) { + msg += ", replaced with '" + DatasetField.NA_VALUE + "'"; + f.setValue(DatasetField.NA_VALUE); + fixed = true; + } + cleanupLog.println(msg); + + // Note: "NA" does not work with certain fields. For example, + // using it to populate a GeoBox coordinate value is going + // to result in an invalid field. So we'll need to validate the + // version again after the first, sanitizing pass and see if it + // helped or not. + } + if (!sanitize) { + throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values."); + } + } + return fixed; + } + + /** + * Helper method that creates a throwaway Harvested Dataset to temporarily + * attach the newly-harvested version to. We need this when, instead of + * importing a brand-new harvested dataset from scratch, we are planning to + * attempt to update an already existing dataset harvested from the same + * archival location. + * @param harvestedVersion - a newly created Version imported from harvested metadata + * @return - a temporary dataset to which the new version has been attached + */ + private Dataset createTemporaryHarvestedDataset(DatasetVersion harvestedVersion) { + Dataset tempDataset = new Dataset(); + harvestedVersion.setDataset(tempDataset); + tempDataset.setVersions(new ArrayList<>(1)); + tempDataset.getVersions().add(harvestedVersion); + + return tempDataset; + } private static class MyCustomFormatter extends Formatter { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java index db9dc142506..b36a638956f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractCreateDatasetCommand.java @@ -13,8 +13,10 @@ import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.pidproviders.PidProvider; import static edu.harvard.iq.dataverse.util.StringUtil.isEmpty; +import java.io.IOException; import java.util.Objects; import java.util.logging.Logger; +import org.apache.solr.client.solrj.SolrServerException; /**; * An abstract base class for commands that creates {@link Dataset}s. @@ -148,9 +150,19 @@ public Dataset execute(CommandContext ctxt) throws CommandException { //Use for code that requires database ids postDBFlush(theDataset, ctxt); - - ctxt.index().asyncIndexDataset(theDataset, true); - + + if (harvested) { + try { + ctxt.index().indexDataset(theDataset, true); + } catch (SolrServerException | IOException solrEx) { + logger.warning("Failed to index harvested dataset.
" + solrEx.getMessage()); + } + } else { + // The asynchronous version does not throw any exceptions, + // logging them internally instead. + ctxt.index().asyncIndexDataset(theDataset, true); + } + return theDataset; } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java index 3a21345448b..76939751899 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java @@ -2,34 +2,29 @@ import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.datasetutility.FileExceedsMaxSizeException; import edu.harvard.iq.dataverse.datasetutility.FileSizeChecker; -import static edu.harvard.iq.dataverse.datasetutility.FileSizeChecker.bytesToHumanReadable; import edu.harvard.iq.dataverse.engine.command.AbstractCommand; import edu.harvard.iq.dataverse.engine.command.CommandContext; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; -//import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.engine.command.exception.CommandExecutionException; import edu.harvard.iq.dataverse.ingest.IngestServiceShapefileHelper; -import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.storageuse.UploadSessionQuotaLimit; -import edu.harvard.iq.dataverse.util.file.FileExceedsStorageQuotaException; import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.util.FileUtil; -import static edu.harvard.iq.dataverse.util.FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT; -import static edu.harvard.iq.dataverse.util.FileUtil.createIngestFailureReport; -import static edu.harvard.iq.dataverse.util.FileUtil.determineFileType; -import static edu.harvard.iq.dataverse.util.FileUtil.determineFileTypeByNameAndExtension; -import static edu.harvard.iq.dataverse.util.FileUtil.getFilesTempDirectory; -import static edu.harvard.iq.dataverse.util.FileUtil.saveInputStreamInTempFile; -import static edu.harvard.iq.dataverse.util.FileUtil.useRecognizedType; import edu.harvard.iq.dataverse.util.ShapefileHandler; import edu.harvard.iq.dataverse.util.StringUtil; import edu.harvard.iq.dataverse.util.file.BagItFileHandler; import edu.harvard.iq.dataverse.util.file.BagItFileHandlerFactory; import edu.harvard.iq.dataverse.util.file.CreateDataFileResult; +import edu.harvard.iq.dataverse.util.file.FileExceedsStorageQuotaException; +import jakarta.enterprise.inject.spi.CDI; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; + import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -42,7 +37,7 @@ import java.text.MessageFormat; import java.util.ArrayList; import java.util.Arrays; -import java.util.Enumeration; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -51,12 +46,17 @@ import java.util.Set; import java.util.logging.Logger; import java.util.zip.GZIPInputStream; -import java.util.zip.ZipFile; import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; -import jakarta.enterprise.inject.spi.CDI; -import org.apache.commons.io.FileUtils; -import 
org.apache.commons.lang3.StringUtils; +import java.util.zip.ZipFile; + +import static edu.harvard.iq.dataverse.datasetutility.FileSizeChecker.bytesToHumanReadable; +import static edu.harvard.iq.dataverse.util.FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT; +import static edu.harvard.iq.dataverse.util.FileUtil.createIngestFailureReport; +import static edu.harvard.iq.dataverse.util.FileUtil.determineFileType; +import static edu.harvard.iq.dataverse.util.FileUtil.determineFileTypeByNameAndExtension; +import static edu.harvard.iq.dataverse.util.FileUtil.getFilesTempDirectory; +import static edu.harvard.iq.dataverse.util.FileUtil.saveInputStreamInTempFile; +import static edu.harvard.iq.dataverse.util.FileUtil.useRecognizedType; /** * @@ -140,9 +140,10 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException if (newStorageIdentifier == null) { - if (getFilesTempDirectory() != null) { + var filesTempDirectory = getFilesTempDirectory(); + if (filesTempDirectory != null) { try { - tempFile = Files.createTempFile(Paths.get(getFilesTempDirectory()), "tmp", "upload"); + tempFile = Files.createTempFile(Paths.get(filesTempDirectory), "tmp", "upload"); // "temporary" location is the key here; this is why we are not using // the DataStore framework for this - the assumption is that // temp files will always be stored on the local filesystem. @@ -260,10 +261,6 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException // DataFile objects from its contents: } else if (finalType.equals("application/zip")) { - ZipFile zipFile = null; - ZipInputStream unZippedIn = null; - ZipEntry zipEntry = null; - int fileNumberLimit = ctxt.systemConfig().getZipUploadFilesLimit(); Long combinedUnzippedFileSize = 0L; @@ -271,14 +268,14 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException Charset charset = null; /* TODO: (?) - We may want to investigate somehow letting the user specify + We may want to investigate somehow letting the user specify the charset for the filenames in the zip file... - - otherwise, ZipInputStream bails out if it encounteres a file - name that's not valid in the current charest (i.e., UTF-8, in - our case). It would be a bit trickier than what we're doing for - SPSS tabular ingests - with the lang. encoding pulldown menu - + - otherwise, ZipInputStream bails out if it encounters a file + name that's not valid in the current charset (i.e., UTF-8, in + our case). It would be a bit trickier than what we're doing for + SPSS tabular ingests - with the lang. encoding pulldown menu - because this encoding needs to be specified *before* we upload and - attempt to unzip the file. + attempt to unzip the file. -- L.A. 4.0 beta12 logger.info("default charset is "+Charset.defaultCharset().name()); if (Charset.isSupported("US-ASCII")) { @@ -287,25 +284,21 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException if (charset != null) { logger.info("was able to obtain charset for US-ASCII"); } - + } */ - /** - * Perform a quick check for how many individual files are - * inside this zip archive. If it's above the limit, we can - * give up right away, without doing any unpacking. + /** + * Perform a quick check for how many individual files are + * inside this zip archive. If it's above the limit, we can + * give up right away, without doing any unpacking. * This should be a fairly inexpensive operation, we just need - * to read the directory at the end of the file.
*/ - - if (charset != null) { - zipFile = new ZipFile(tempFile.toFile(), charset); - } else { - zipFile = new ZipFile(tempFile.toFile()); - } + + /** - * The ZipFile constructors above will throw ZipException - + * The ZipFile constructors in openZipFile will throw ZipException - * a type of IOException - if there's something wrong * with this file as a zip. There's no need to intercept it * here, it will be caught further below, with other IOExceptions, @@ -313,8 +306,8 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException * then attempt to save it as is. */ - int numberOfUnpackableFiles = 0; - + int numberOfUnpackableFiles = 0; + /** * Note that we can't just use zipFile.size(), * unfortunately, since that's the total number of entries, @@ -323,83 +316,46 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException * that are files. */ - for (Enumeration entries = zipFile.entries(); entries.hasMoreElements();) { - ZipEntry entry = entries.nextElement(); - logger.fine("inside first zip pass; this entry: "+entry.getName()); - if (!entry.isDirectory()) { - String shortName = entry.getName().replaceFirst("^.*[\\/]", ""); - // ... and, finally, check if it's a "fake" file - a zip archive entry - // created for a MacOS X filesystem element: (these - // start with "._") - if (!shortName.startsWith("._") && !shortName.startsWith(".DS_Store") && !"".equals(shortName)) { - numberOfUnpackableFiles++; - if (numberOfUnpackableFiles > fileNumberLimit) { - logger.warning("Zip upload - too many files in the zip to process individually."); - warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit - + "); please upload a zip archive with fewer files, if you want them to be ingested " - + "as individual DataFiles."; - throw new IOException(); - } - // In addition to counting the files, we can - // also check the file size while we're here, - // provided the size limit is defined; if a single - // file is above the individual size limit, unzipped, - // we give up on unpacking this zip archive as well: - if (fileSizeLimit != null && entry.getSize() > fileSizeLimit) { - throw new FileExceedsMaxSizeException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.file_exceeds_limit"), bytesToHumanReadable(entry.getSize()), bytesToHumanReadable(fileSizeLimit))); - } - // Similarly, we want to check if saving all these unpacked - // files is going to push the disk usage over the - // quota: - if (storageQuotaLimit != null) { - combinedUnzippedFileSize = combinedUnzippedFileSize + entry.getSize(); - if (combinedUnzippedFileSize > storageQuotaLimit) { - //throw new FileExceedsStorageQuotaException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.quota_exceeded"), bytesToHumanReadable(combinedUnzippedFileSize), bytesToHumanReadable(storageQuotaLimit))); - // change of plans: if the unzipped content inside exceeds the remaining quota, - // we reject the upload outright, rather than accepting the zip - // file as is. 
- throw new CommandExecutionException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.unzipped.quota_exceeded"), bytesToHumanReadable(storageQuotaLimit)), this); - } + try (var zipFile = openZipFile(tempFile, charset)) { + var zipEntries = filteredZipEntries(zipFile); + for (var entry : zipEntries) { + logger.fine("inside first zip pass; this entry: " + entry.getName()); + numberOfUnpackableFiles++; + if (numberOfUnpackableFiles > fileNumberLimit) { + logger.warning("Zip upload - too many files in the zip to process individually."); + warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit + + "); please upload a zip archive with fewer files, if you want them to be ingested " + + "as individual DataFiles."; + throw new IOException(); + } + // In addition to counting the files, we can + // also check the file size while we're here, + // provided the size limit is defined; if a single + // file is above the individual size limit, unzipped, + // we give up on unpacking this zip archive as well: + if (fileSizeLimit != null && entry.getSize() > fileSizeLimit) { + throw new FileExceedsMaxSizeException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.file_exceeds_limit"), bytesToHumanReadable(entry.getSize()), bytesToHumanReadable(fileSizeLimit))); + } + // Similarly, we want to check if saving all these unpacked + // files is going to push the disk usage over the + // quota: + if (storageQuotaLimit != null) { + combinedUnzippedFileSize = combinedUnzippedFileSize + entry.getSize(); + if (combinedUnzippedFileSize > storageQuotaLimit) { + //throw new FileExceedsStorageQuotaException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.quota_exceeded"), bytesToHumanReadable(combinedUnzippedFileSize), bytesToHumanReadable(storageQuotaLimit))); + // change of plans: if the unzipped content inside exceeds the remaining quota, + // we reject the upload outright, rather than accepting the zip + // file as is. + throw new CommandExecutionException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.unzipped.quota_exceeded"), bytesToHumanReadable(storageQuotaLimit)), this); } } } - } - - // OK we're still here - that means we can proceed unzipping. - - // Close the ZipFile, re-open as ZipInputStream: - zipFile.close(); - // reset: - combinedUnzippedFileSize = 0L; - - if (charset != null) { - unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile()), charset); - } else { - unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile())); - } - - while (true) { - try { - zipEntry = unZippedIn.getNextEntry(); - } catch (IllegalArgumentException iaex) { - // Note: - // ZipInputStream documentation doesn't even mention that - // getNextEntry() throws an IllegalArgumentException! - // but that's what happens if the file name of the next - // entry is not valid in the current CharSet. - // -- L.A. - warningMessage = "Failed to unpack Zip file. (Unknown Character Set used in a file name?) Saving the file as is."; - logger.warning(warningMessage); - throw new IOException(); - } + // OK we're still here - that means we can proceed unzipping. 
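Aside: the rewritten block above performs a cheap first pass over the archive's central directory (via `ZipFile.entries()`), counting unpackable entries and checking sizes before anything is extracted. A simplified standalone sketch of that pre-check; the hard-coded limits stand in for the real `SystemConfig` and upload-session quota lookups:

```java
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

public class ZipPrecheckDemo {
    // Illustrative limits; the command reads the real ones from configuration.
    static final int FILE_NUMBER_LIMIT = 1000;
    static final long FILE_SIZE_LIMIT = 2L * 1024 * 1024 * 1024; // 2 GB

    public static void main(String[] args) throws IOException {
        File upload = new File(args[0]);
        // Pass 1: read only the central directory at the end of the archive;
        // cheap compared to streaming every entry, and enough to reject an
        // oversized upload before unpacking a single byte.
        try (ZipFile zipFile = new ZipFile(upload)) {
            int unpackable = 0;
            long combinedSize = 0L;
            for (ZipEntry entry : Collections.list(zipFile.entries())) {
                String shortName = entry.getName().replaceFirst("^.*[\\/]", "");
                // Skip directories and MacOS X artifacts ("._*", ".DS_Store"),
                // mirroring filteredZipEntries()/isFileToSkip() in the command.
                if (entry.isDirectory() || shortName.isEmpty()
                        || shortName.startsWith("._") || shortName.startsWith(".DS_Store")) {
                    continue;
                }
                unpackable++;
                combinedSize += entry.getSize(); // note: getSize() may be -1 if unknown
                if (unpackable > FILE_NUMBER_LIMIT || entry.getSize() > FILE_SIZE_LIMIT) {
                    System.out.println("rejecting archive without unpacking anything");
                    return;
                }
            }
            System.out.println(unpackable + " entries, " + combinedSize + " bytes: OK to unpack");
        }
    }
}
```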
- if (zipEntry == null) { - break; - } - // Note that some zip entries may be directories - we - // simply skip them: + // reset: + combinedUnzippedFileSize = 0L; - if (!zipEntry.isDirectory()) { + for (var entry : zipEntries) { if (datafiles.size() > fileNumberLimit) { logger.warning("Zip upload - too many files."); warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit @@ -407,72 +363,55 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException + "as individual DataFiles."; throw new IOException(); } - - String fileEntryName = zipEntry.getName(); + var fileEntryName = entry.getName(); + var shortName = getShortName(fileEntryName); logger.fine("ZipEntry, file: " + fileEntryName); + String storageIdentifier = FileUtil.generateStorageIdentifier(); + File unzippedFile = new File(getFilesTempDirectory() + "/" + storageIdentifier); + Files.copy(zipFile.getInputStream(entry), unzippedFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + // No need to check the size of this unpacked file against the size limit, + // since we've already checked for that in the first pass. + DataFile datafile = FileUtil.createSingleDataFile(version, null, storageIdentifier, shortName, + MIME_TYPE_UNDETERMINED_DEFAULT, + ctxt.systemConfig().getFileFixityChecksumAlgorithm(), null, false); + + if (!fileEntryName.equals(shortName)) { + // If the filename looks like a hierarchical folder name (i.e., contains slashes and backslashes), + // we'll extract the directory name; then subject it to some "aggressive sanitizing" - strip all + // the leading, trailing and duplicate slashes; then replace all the characters that + // don't pass our validation rules. + String directoryName = fileEntryName.replaceFirst("[\\\\/][\\\\/]*[^\\\\/]*$", ""); + directoryName = StringUtil.sanitizeFileDirectory(directoryName, true); + // if (!"".equals(directoryName)) { + if (!StringUtil.isEmpty(directoryName)) { + logger.fine("setting the directory label to " + directoryName); + datafile.getFileMetadata().setDirectoryLabel(directoryName); + } + } - if (fileEntryName != null && !fileEntryName.equals("")) { - - String shortName = fileEntryName.replaceFirst("^.*[\\/]", ""); - - // Check if it's a "fake" file - a zip archive entry - // created for a MacOS X filesystem element: (these - // start with "._") - if (!shortName.startsWith("._") && !shortName.startsWith(".DS_Store") && !"".equals(shortName)) { - // OK, this seems like an OK file entry - we'll try - // to read it and create a DataFile with it: - - String storageIdentifier = FileUtil.generateStorageIdentifier(); - File unzippedFile = new File(getFilesTempDirectory() + "/" + storageIdentifier); - Files.copy(unZippedIn, unzippedFile.toPath(), StandardCopyOption.REPLACE_EXISTING); - // No need to check the size of this unpacked file against the size limit, - // since we've already checked for that in the first pass. - - DataFile datafile = FileUtil.createSingleDataFile(version, null, storageIdentifier, shortName, - MIME_TYPE_UNDETERMINED_DEFAULT, - ctxt.systemConfig().getFileFixityChecksumAlgorithm(), null, false); - - if (!fileEntryName.equals(shortName)) { - // If the filename looks like a hierarchical folder name (i.e., contains slashes and backslashes), - // we'll extract the directory name; then subject it to some "aggressive sanitizing" - strip all - // the leading, trailing and duplicate slashes; then replace all the characters that - // don't pass our validation rules. 
- String directoryName = fileEntryName.replaceFirst("[\\\\/][\\\\/]*[^\\\\/]*$", ""); - directoryName = StringUtil.sanitizeFileDirectory(directoryName, true); - // if (!"".equals(directoryName)) { - if (!StringUtil.isEmpty(directoryName)) { - logger.fine("setting the directory label to " + directoryName); - datafile.getFileMetadata().setDirectoryLabel(directoryName); - } - } + if (datafile != null) { + // We have created this datafile with the mime type "unknown"; + // Now that we have it saved in a temporary location, + // let's try and determine its real type: + + String tempFileName = getFilesTempDirectory() + "/" + datafile.getStorageIdentifier(); - if (datafile != null) { - // We have created this datafile with the mime type "unknown"; - // Now that we have it saved in a temporary location, - // let's try and determine its real type: - - String tempFileName = getFilesTempDirectory() + "/" + datafile.getStorageIdentifier(); - - try { - recognizedType = determineFileType(unzippedFile, shortName); - // null the File explicitly, to release any open FDs: - unzippedFile = null; - logger.fine("File utility recognized unzipped file as " + recognizedType); - if (recognizedType != null && !recognizedType.equals("")) { - datafile.setContentType(recognizedType); - } - } catch (Exception ex) { - logger.warning("Failed to run the file utility mime type check on file " + fileName); - } - - datafiles.add(datafile); - combinedUnzippedFileSize += datafile.getFilesize(); + try { + recognizedType = determineFileType(unzippedFile, shortName); + // null the File explicitly, to release any open FDs: + unzippedFile = null; + logger.fine("File utility recognized unzipped file as " + recognizedType); + if (recognizedType != null && !recognizedType.equals("")) { + datafile.setContentType(recognizedType); } + } catch (Exception ex) { + logger.warning("Failed to run the file utility mime type check on file " + fileName); } + + datafiles.add(datafile); + combinedUnzippedFileSize += datafile.getFilesize(); } } - unZippedIn.closeEntry(); - } } catch (IOException ioex) { @@ -494,18 +433,7 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException //warningMessage = BundleUtil.getStringFromBundle("file.addreplace.warning.unzip.failed.quota", Arrays.asList(FileSizeChecker.bytesToHumanReadable(storageQuotaLimit))); //datafiles.clear(); throw new CommandExecutionException(fesqx.getMessage(), fesqx, this); - }*/ finally { - if (zipFile != null) { - try { - zipFile.close(); - } catch (Exception zEx) {} - } - if (unZippedIn != null) { - try { - unZippedIn.close(); - } catch (Exception zEx) {} - } - } + }*/ if (!datafiles.isEmpty()) { // remove the uploaded zip file: try { @@ -591,7 +519,8 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException // The try-catch is due to error encountered in using NFS for stocking file, // cf. https://github.com/IQSS/dataverse/issues/5909 try { - FileUtils.deleteDirectory(rezipFolder); + if (rezipFolder != null) + FileUtils.deleteDirectory(rezipFolder); } catch (IOException ioex) { // do nothing - it's a temp folder.
logger.warning("Could not remove temp folder, error message : " + ioex.getMessage()); @@ -730,7 +659,37 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException return CreateDataFileResult.error(fileName, finalType); } // end createDataFiles - + + private static List filteredZipEntries(ZipFile zipFile) { + var entries = Collections.list(zipFile.entries()).stream().filter(e -> { + var entryName = e.getName(); + logger.fine("ZipEntry, file: " + entryName); + return !e.isDirectory() && !entryName.isEmpty() && !isFileToSkip(entryName); + }).toList(); + return entries; + } + + private static ZipFile openZipFile(Path tempFile, Charset charset) throws IOException { + if (charset != null) { + return new ZipFile(tempFile.toFile(), charset); + } + else { + return new ZipFile(tempFile.toFile()); + } + } + + private static boolean isFileToSkip(String fileName) { + // check if it's a "fake" file - a zip archive entry + // created for a MacOS X filesystem element: (these + // start with "._") + var shortName = getShortName(fileName); + return shortName.startsWith("._") || shortName.startsWith(".DS_Store") || "".equals(shortName); + } + + private static String getShortName(String fileName) { + return fileName.replaceFirst("^.*[\\/]", ""); + } + @Override public Map> getRequiredPermissions() { Map> ret = new HashMap<>(); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDataverseAttributeCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDataverseAttributeCommand.java new file mode 100644 index 00000000000..57ac20fcee6 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDataverseAttributeCommand.java @@ -0,0 +1,110 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.Dataverse; +import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.engine.command.AbstractCommand; +import edu.harvard.iq.dataverse.engine.command.CommandContext; +import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.engine.command.exception.CommandException; +import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; +import edu.harvard.iq.dataverse.engine.command.exception.PermissionException; +import edu.harvard.iq.dataverse.settings.SettingsServiceBean; + +import java.util.Collections; + +/** + * Command to update an existing Dataverse attribute. 
+ */ +@RequiredPermissions(Permission.EditDataverse) +public class UpdateDataverseAttributeCommand extends AbstractCommand<Dataverse> { + + private static final String ATTRIBUTE_ALIAS = "alias"; + private static final String ATTRIBUTE_NAME = "name"; + private static final String ATTRIBUTE_DESCRIPTION = "description"; + private static final String ATTRIBUTE_AFFILIATION = "affiliation"; + private static final String ATTRIBUTE_FILE_PIDS_ENABLED = "filePIDsEnabled"; + + private final Dataverse dataverse; + private final String attributeName; + private final Object attributeValue; + + public UpdateDataverseAttributeCommand(DataverseRequest request, Dataverse dataverse, String attributeName, Object attributeValue) { + super(request, dataverse); + this.dataverse = dataverse; + this.attributeName = attributeName; + this.attributeValue = attributeValue; + } + + @Override + public Dataverse execute(CommandContext ctxt) throws CommandException { + switch (attributeName) { + case ATTRIBUTE_ALIAS: + case ATTRIBUTE_NAME: + case ATTRIBUTE_DESCRIPTION: + case ATTRIBUTE_AFFILIATION: + setStringAttribute(attributeName, attributeValue); + break; + case ATTRIBUTE_FILE_PIDS_ENABLED: + setBooleanAttributeForFilePIDs(ctxt); + break; + default: + throw new IllegalCommandException("'" + attributeName + "' is not a supported attribute", this); + } + + return ctxt.engine().submit(new UpdateDataverseCommand(dataverse, null, null, getRequest(), null)); + } + + /** + * Helper method to set a string attribute. + * + * @param attributeName The name of the attribute. + * @param attributeValue The value of the attribute (must be a String). + * @throws IllegalCommandException if the provided attribute value is not of String type. + */ + private void setStringAttribute(String attributeName, Object attributeValue) throws IllegalCommandException { + if (!(attributeValue instanceof String stringValue)) { + throw new IllegalCommandException("'" + attributeName + "' requires a string value", this); + } + + switch (attributeName) { + case ATTRIBUTE_ALIAS: + dataverse.setAlias(stringValue); + break; + case ATTRIBUTE_NAME: + dataverse.setName(stringValue); + break; + case ATTRIBUTE_DESCRIPTION: + dataverse.setDescription(stringValue); + break; + case ATTRIBUTE_AFFILIATION: + dataverse.setAffiliation(stringValue); + break; + default: + throw new IllegalCommandException("Unsupported string attribute: " + attributeName, this); + } + } + + /** + * Helper method to handle the "filePIDsEnabled" boolean attribute. + * + * @param ctxt The command context. + * @throws PermissionException if the user doesn't have permission to modify this attribute.
+ */ + private void setBooleanAttributeForFilePIDs(CommandContext ctxt) throws CommandException { + if (!getRequest().getUser().isSuperuser()) { + throw new PermissionException("You must be a superuser to change this setting", + this, Collections.singleton(Permission.EditDataset), dataverse); + } + if (!ctxt.settings().isTrueForKey(SettingsServiceBean.Key.AllowEnablingFilePIDsPerCollection, false)) { + throw new PermissionException("Changing File PID policy per collection is not enabled on this server", + this, Collections.singleton(Permission.EditDataset), dataverse); + } + + if (!(attributeValue instanceof Boolean)) { + throw new IllegalCommandException("'" + ATTRIBUTE_FILE_PIDS_ENABLED + "' requires a boolean value", this); + } + + dataverse.setFilePIDsEnabled((Boolean) attributeValue); + } +} diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java new file mode 100644 index 00000000000..09563686299 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateHarvestedDatasetCommand.java @@ -0,0 +1,202 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.engine.command.CommandContext; +import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.engine.command.exception.CommandException; +import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.FileMetadata; +import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile; +import edu.harvard.iq.dataverse.util.StringUtil; +import java.io.IOException; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.solr.client.solrj.SolrServerException; + +/** + * + * @author landreev + * + * Much simplified version of UpdateDatasetVersionCommand, + * but with some extra twists. The goal is to avoid creating new Dataset and + * DataFile objects, and to instead preserve the database ids of the re-harvested + * datasets and files, whenever possible. This in turn allows us to avoid deleting + * and rebuilding from scratch the Solr documents for these objects. 
+ */ +@RequiredPermissions(Permission.EditDataset) +public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand { + + private static final Logger logger = Logger.getLogger(UpdateHarvestedDatasetCommand.class.getCanonicalName()); + private final DatasetVersion newHarvestedVersion; + final private boolean validateLenient = true; + + public UpdateHarvestedDatasetCommand(Dataset theDataset, DatasetVersion newHarvestedVersion, DataverseRequest aRequest) { + super(aRequest, theDataset); + this.newHarvestedVersion = newHarvestedVersion; + } + + public boolean isValidateLenient() { + return validateLenient; + } + + @Override + public Dataset execute(CommandContext ctxt) throws CommandException { + + Dataset existingDataset = getDataset(); + + if (existingDataset == null + || existingDataset.getId() == null + || !existingDataset.isHarvested() + || existingDataset.getVersions().size() != 1) { + throw new IllegalCommandException("The command can only be called on an existing harvested dataset with only 1 version", this); + } + DatasetVersion existingVersion = existingDataset.getVersions().get(0); + + if (newHarvestedVersion == null || newHarvestedVersion.getId() != null) { + throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this); + } + + newHarvestedVersion.setCreateTime(getTimestamp()); + newHarvestedVersion.setLastUpdateTime(getTimestamp()); + + + Map existingFilesIndex = new HashMap<>(); + + /* + Create a map of the files that are currently part of this existing + harvested dataset. We assume that a harvested file can be uniquely + defined by its storageidentifier. Which, in the case of a datafile + harvested from another Dataverse should be its data access api url. + */ + for (int i = 0; i < existingDataset.getFiles().size(); i++) { + String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier(); + if (!StringUtil.isEmpty(storageIdentifier)) { + existingFilesIndex.put(storageIdentifier, i); + } + } + + /* + Go through the files in the newly-harvested version and check if any of + them are (potentially new/updated) versions of files that we already + have, harvested previously from the same archive location. + */ + for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { + // is it safe to assume that each new FileMetadata will be + // pointing to a non-null DataFile here? + String storageIdentifier = newFileMetadata.getDataFile().getStorageIdentifier(); + if (!StringUtil.isEmpty(storageIdentifier) && existingFilesIndex.containsKey(storageIdentifier)) { + newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>()); + + int fileIndex = existingFilesIndex.get(storageIdentifier); + + // Make sure to update the existing DataFiles that we are going + // to keep in case their newly-harvested versions have different + // checksums, mime types etc. 
These values are supposed to be + // immutable, normally - but who knows, errors happen, the source + // Dataverse may have had to fix these in their database to + // correct a data integrity issue (for ex.): + existingDataset.getFiles().get(fileIndex).setContentType(newFileMetadata.getDataFile().getContentType()); + existingDataset.getFiles().get(fileIndex).setFilesize(newFileMetadata.getDataFile().getFilesize()); + existingDataset.getFiles().get(fileIndex).setChecksumType(newFileMetadata.getDataFile().getChecksumType()); + existingDataset.getFiles().get(fileIndex).setChecksumValue(newFileMetadata.getDataFile().getChecksumValue()); + + // Point the newly-harvested filemetadata to the existing datafile: + newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex)); + + // Make sure this new FileMetadata is the only one attached to this existing file: + existingDataset.getFiles().get(fileIndex).setFileMetadatas(new ArrayList<>(1)); + existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata); + // (we don't want any cascade relationships left between this existing + // dataset and this version, since we are going to attempt to delete it). + + // Drop the file from the index map: + existingFilesIndex.remove(storageIdentifier); + } + } + + // @todo? check if there's anything special that needs to be done for things + // like file categories + + List solrIdsOfDocumentsToDelete = new ArrayList<>(); + + // Go through the existing files and delete the ones that are + // no longer present in the version that we have just harvested: + for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) { + DataFile oldDataFile = oldFileMetadata.getDataFile(); + String storageIdentifier = oldDataFile.getStorageIdentifier(); + // Is it still in the existing files map? - that means it is no longer + // present in the newly-harvested version + if (StringUtil.isEmpty(storageIdentifier) || existingFilesIndex.containsKey(storageIdentifier)) { + solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId()); + existingDataset.getFiles().remove(oldDataFile); + // Files from harvested datasets are removed unceremoniously, + // directly in the database. No need to bother calling the + // DeleteFileCommand on them. We'll just need to remember to purge + // them from Solr as well (right below) + ctxt.em().remove(ctxt.em().merge(oldDataFile)); + // (no need to explicitly remove the oldFileMetadata; it will be + // removed when the entire old version is deleted) + } + } + + // purge all the SOLR documents associated with the files + // we have just deleted: + if (!solrIdsOfDocumentsToDelete.isEmpty()) { + ctxt.index().deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); + } + + // ... And now delete the existing version itself: + existingDataset.setVersions(new ArrayList<>()); + existingVersion.setDataset(null); + + existingVersion = ctxt.em().merge(existingVersion); + ctxt.em().remove(existingVersion); + + // Now attach the newly-harvested version to the dataset: + existingDataset.getVersions().add(newHarvestedVersion); + newHarvestedVersion.setDataset(existingDataset); + + // ...
There's one more thing to do - go through the new files, + // that are not in the database yet, and make sure they are + // attached to this existing dataset, instead of the dummy temp + // dataset into which they were originally imported: + for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) { + if (newFileMetadata.getDataFile().getId() == null) { + existingDataset.getFiles().add(newFileMetadata.getDataFile()); + newFileMetadata.getDataFile().setOwner(existingDataset); + } + } + + ctxt.em().persist(newHarvestedVersion); + + Dataset savedDataset = ctxt.em().merge(existingDataset); + ctxt.em().flush(); + + return savedDataset; + } + + @Override + public boolean onSuccess(CommandContext ctxt, Object r) { + boolean retVal = true; + Dataset d = (Dataset) r; + + try { + // Note that we index harvested datasets synchronously: + ctxt.index().indexDataset(d, true); + } catch (SolrServerException|IOException solrServerEx) { + logger.log(Level.WARNING, "Exception while trying to index the updated Harvested dataset " + d.getGlobalId().asString(), solrServerEx.getMessage()); + retVal = false; + } + + return retVal; + } +} diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceShapefileHelper.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceShapefileHelper.java index 8c5dad237b1..27a2ab99376 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceShapefileHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceShapefileHelper.java @@ -100,71 +100,48 @@ public IngestServiceShapefileHelper(File zippedShapefile, File rezipFolder){ //this.processFile(zippedShapefile, rezipFolder); } - - private FileInputStream getFileInputStream(File fileObject){ - if (fileObject==null){ - return null; - } - try { + + private FileInputStream getFileInputStream(File fileObject){ + if (fileObject==null){ + return null; + } + try { return new FileInputStream(fileObject); } catch (FileNotFoundException ex) { logger.severe("Failed to create FileInputStream from File: " + fileObject.getAbsolutePath()); return null; } - } - - private void closeFileInputStream(FileInputStream fis){ - if (fis==null){ - return; - } + } + + private void closeFileInputStream(FileInputStream fis){ + if (fis==null){ + return; + } try { - fis.close(); + fis.close(); } catch (IOException ex) { logger.info("Failed to close FileInputStream"); } - } - + } + public boolean processFile() { if ((!isValidFile(this.zippedShapefile))||(!isValidFolder(this.rezipFolder))){ return false; } - - // (1) Use the ShapefileHandler to the .zip for a shapefile - // - FileInputStream shpfileInputStream = this.getFileInputStream(zippedShapefile); - if (shpfileInputStream==null){ - return false; - } - - this.shpHandler = new ShapefileHandler(shpfileInputStream); - if (!shpHandler.containsShapefile()){ - logger.severe("Shapefile was incorrectly detected upon Ingest (FileUtil) and passed here"); - return false; - } - - this.closeFileInputStream(shpfileInputStream); - - // (2) Rezip the shapefile pieces - logger.info("rezipFolder: " + rezipFolder.getAbsolutePath()); - shpfileInputStream = this.getFileInputStream(zippedShapefile); - if (shpfileInputStream==null){ - return false; - } - - boolean rezipSuccess; try { - rezipSuccess = shpHandler.rezipShapefileSets(shpfileInputStream, rezipFolder); + this.shpHandler = new ShapefileHandler(zippedShapefile); + if (!shpHandler.containsShapefile()){ + logger.severe("Shapefile was incorrectly detected upon Ingest (FileUtil) and passed here"); + 
return false; + } + logger.info("rezipFolder: " + rezipFolder.getAbsolutePath()); + return shpHandler.rezipShapefileSets(rezipFolder); } catch (IOException ex) { logger.severe("Shapefile was not correctly unpacked/repacked"); logger.severe("shpHandler message: " + shpHandler.errorMessage); return false; } - - this.closeFileInputStream(shpfileInputStream); - - return rezipSuccess; - // return createDataFiles(rezipFolder); } diff --git a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java index 50c24274bb2..fa56432cc3c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java @@ -7,6 +7,9 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetServiceBean; +import edu.harvard.iq.dataverse.GlobalId; +import edu.harvard.iq.dataverse.pidproviders.PidUtil; + import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -40,7 +43,8 @@ public class DatasetExternalCitationsServiceBean implements java.io.Serializable Arrays.asList( "cites", "references", - "supplements")); + "supplements", + "is-supplement-to")); static ArrayList outboundRelationships = new ArrayList( Arrays.asList( "is-cited-by", @@ -59,12 +63,11 @@ public List parseCitations(JsonArray citations) { if (inboundRelationships.contains(relationship)) { Dataset localDs = null; if (objectUri.contains("doi")) { - String globalId = objectUri.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:"); - localDs = datasetService.findByGlobalId(globalId); + localDs = datasetService.findByGlobalId(objectUri); exCit.setDataset(localDs); } exCit.setCitedByUrl(subjectUri); - + if (localDs != null && !exCit.getCitedByUrl().isEmpty()) { datasetExternalCitations.add(exCit); } @@ -72,9 +75,9 @@ public List parseCitations(JsonArray citations) { if (outboundRelationships.contains(relationship)) { Dataset localDs = null; if (subjectUri.contains("doi")) { - String globalId = subjectUri.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:"); - localDs = datasetService.findByGlobalId(globalId); + localDs = datasetService.findByGlobalId(subjectUri); exCit.setDataset(localDs); + } exCit.setCitedByUrl(objectUri); diff --git a/src/main/java/edu/harvard/iq/dataverse/metrics/MetricsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/metrics/MetricsServiceBean.java index a74474efa15..5bdbeac031d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/metrics/MetricsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/metrics/MetricsServiceBean.java @@ -168,25 +168,12 @@ public long datasetsToMonth(String yyyymm, String dataLocation, Dataverse d) { } } - // Note that this SQL line in the code below: - // datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) - // behaves somewhat counter-intuitively if the versionnumber and/or - // minorversionnumber is/are NULL - it results in an empty string - // (NOT the string "{dataset_id}:", in other words). Some harvested - // versions do not have version numbers (only the ones harvested from - // other Dataverses!) It works fine - // for our purposes below, because we are simply counting the selected - // lines - i.e. we don't care if some of these lines are empty. 
- // But do not use this notation if you need the values returned to - // meaningfully identify the datasets! - - Query query = em.createNativeQuery( "select count(*)\n" + "from (\n" - + "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))\n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.dataset_id \n" + "from datasetversion\n" + "join dataset on dataset.id = datasetversion.dataset_id\n" + ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n") @@ -194,7 +181,7 @@ public long datasetsToMonth(String yyyymm, String dataLocation, Dataverse d) { + ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n ") + "and \n" + dataLocationLine // be careful about adding more and statements after this line. - + "group by dataset_id \n" + + " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc\n" +") sub_temp" ); logger.log(Level.FINE, "Metric query: {0}", query); @@ -207,15 +194,15 @@ public List datasetsBySubjectToMonth(String yyyymm, String dataLocatio // A published local datasets may have more than one released version! // So that's why we have to jump through some extra hoops below // in order to select the latest one: - String originClause = "(datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in\n" + + String originClause = "(datasetversion.id in\n" + "(\n" + - "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber))\n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id\n" + " from datasetversion\n" + " join dataset on dataset.id = datasetversion.dataset_id\n" + " where versionstate='RELEASED'\n" + " and dataset.harvestingclient_id is null\n" + " and date_trunc('month', releasetime) <= to_date('" + yyyymm + "','YYYY-MM')\n" + - " group by dataset_id\n" + + " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc\n" + "))\n"; if (!DATA_LOCATION_LOCAL.equals(dataLocation)) { // Default api state is DATA_LOCATION_LOCAL @@ -273,7 +260,7 @@ public long datasetsPastDays(int days, String dataLocation, Dataverse d) { Query query = em.createNativeQuery( "select count(*)\n" + "from (\n" - + "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max\n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id\n" + "from datasetversion\n" + "join dataset on dataset.id = datasetversion.dataset_id\n" + ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n") @@ -281,7 +268,7 @@ public long datasetsPastDays(int days, String dataLocation, Dataverse d) { + ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n") + "and \n" + dataLocationLine // be careful about adding more and statements after this line. 
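+ // (DISTINCT ON (...) is PostgreSQL-specific: it keeps one row per dataset_id,
+ // chosen by the ORDER BY below, i.e. the latest released version. Unlike the old
+ // versionnumber + (.1 * minorversionnumber) arithmetic, this remains correct
+ // when the minor version number is greater than 9.)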
- + "group by dataset_id \n" + + " order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc \n" +") sub_temp" ); logger.log(Level.FINE, "Metric query: {0}", query); @@ -322,9 +309,9 @@ public long filesToMonth(String yyyymm, Dataverse d) { + "select count(*)\n" + "from filemetadata\n" + "join datasetversion on datasetversion.id = filemetadata.datasetversion_id\n" - + "where datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in \n" + + "where datasetversion.id in \n" + "(\n" - + "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max \n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id \n" + "from datasetversion\n" + "join dataset on dataset.id = datasetversion.dataset_id\n" + ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n") @@ -332,7 +319,7 @@ public long filesToMonth(String yyyymm, Dataverse d) { + ((d == null) ? "" : "and dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n") + "and date_trunc('month', releasetime) <= to_date('" + yyyymm + "','YYYY-MM')\n" + "and dataset.harvestingclient_id is null\n" - + "group by dataset_id \n" + + "order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber \n" + ");" ); logger.log(Level.FINE, "Metric query: {0}", query); @@ -345,9 +332,9 @@ public long filesPastDays(int days, Dataverse d) { + "select count(*)\n" + "from filemetadata\n" + "join datasetversion on datasetversion.id = filemetadata.datasetversion_id\n" - + "where datasetversion.dataset_id || ':' || datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber) in \n" + + "where datasetversion.id in \n" + "(\n" - + "select datasetversion.dataset_id || ':' || max(datasetversion.versionnumber + (.1 * datasetversion.minorversionnumber)) as max \n" + + "select DISTINCT ON (datasetversion.dataset_id) datasetversion.id \n" + "from datasetversion\n" + "join dataset on dataset.id = datasetversion.dataset_id\n" + ((d == null) ? "" : "join dvobject on dvobject.id = dataset.id\n") @@ -355,7 +342,7 @@ public long filesPastDays(int days, Dataverse d) { + "and releasetime > current_date - interval '" + days + "' day\n" + ((d == null) ? 
"" : "AND dvobject.owner_id in (" + getCommaSeparatedIdStringForSubtree(d, "Dataverse") + ")\n") + "and dataset.harvestingclient_id is null\n" - + "group by dataset_id \n" + + "order by datasetversion.dataset_id, datasetversion.versionnumber desc, datasetversion.minorversionnumber desc \n" + ");" ); logger.log(Level.FINE, "Metric query: {0}", query); diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java index f6d142aac96..250eae7e5fc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java @@ -36,9 +36,9 @@ public abstract class AbstractPidProvider implements PidProvider { private String datafilePidFormat = null; - private HashSet managedSet; + protected HashSet managedSet = new HashSet(); - private HashSet excludedSet; + protected HashSet excludedSet = new HashSet(); private String id; private String label; @@ -47,8 +47,6 @@ protected AbstractPidProvider(String id, String label, String protocol) { this.id = id; this.label = label; this.protocol = protocol; - this.managedSet = new HashSet(); - this.excludedSet = new HashSet(); } protected AbstractPidProvider(String id, String label, String protocol, String authority, String shoulder, @@ -60,8 +58,12 @@ protected AbstractPidProvider(String id, String label, String protocol, String a this.shoulder = shoulder; this.identifierGenerationStyle = identifierGenerationStyle; this.datafilePidFormat = datafilePidFormat; - this.managedSet = new HashSet(Arrays.asList(managedList.split(",\\s"))); - this.excludedSet = new HashSet(Arrays.asList(excludedList.split(",\\s"))); + if(!managedList.isEmpty()) { + this.managedSet.addAll(Arrays.asList(managedList.split(",\\s"))); + } + if(!excludedList.isEmpty()) { + this.excludedSet.addAll(Arrays.asList(excludedList.split(",\\s"))); + } if (logger.isLoggable(Level.FINE)) { Iterator iter = managedSet.iterator(); while (iter.hasNext()) { @@ -313,10 +315,17 @@ protected GlobalId parsePersistentId(String protocol, String identifierString) { } public GlobalId parsePersistentId(String protocol, String authority, String identifier) { + return parsePersistentId(protocol, authority, identifier, false); + } + + public GlobalId parsePersistentId(String protocol, String authority, String identifier, boolean isCaseInsensitive) { logger.fine("Parsing: " + protocol + ":" + authority + getSeparator() + identifier + " in " + getId()); if (!PidProvider.isValidGlobalId(protocol, authority, identifier)) { return null; } + if(isCaseInsensitive) { + identifier = identifier.toUpperCase(); + } // Check authority/identifier if this is a provider that manages specific // identifiers // /is not one of the unmanaged providers that has null authority @@ -333,7 +342,7 @@ public GlobalId parsePersistentId(String protocol, String authority, String iden logger.fine("managed in " + getId() + ": " + getManagedSet().contains(cleanIdentifier)); logger.fine("excluded from " + getId() + ": " + getExcludedSet().contains(cleanIdentifier)); - if (!(((authority.equals(getAuthority()) && identifier.startsWith(getShoulder())) + if (!(((authority.equals(getAuthority()) && identifier.startsWith(getShoulder().toUpperCase())) || getManagedSet().contains(cleanIdentifier)) && !getExcludedSet().contains(cleanIdentifier))) { return null; } diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/AbstractDOIProvider.java 
b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/AbstractDOIProvider.java index 02a7dedce47..70ce1ec4c14 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/AbstractDOIProvider.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/AbstractDOIProvider.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.pidproviders.doi; import java.util.Arrays; +import java.util.HashSet; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; @@ -26,9 +27,24 @@ public abstract class AbstractDOIProvider extends AbstractPidProvider { public AbstractDOIProvider(String id, String label, String providerAuthority, String providerShoulder, String identifierGenerationStyle, String datafilePidFormat, String managedList, String excludedList) { super(id, label, DOI_PROTOCOL, providerAuthority, providerShoulder, identifierGenerationStyle, datafilePidFormat, managedList, excludedList); + //Create case insensitive (converted toUpperCase) managedSet and excludedSet + managedSet = clean(managedSet, "managed"); + excludedSet = clean(excludedSet, "excluded"); } - //For Unmanged provider + private HashSet clean(HashSet originalSet, String setName) { + HashSet cleanSet = new HashSet(); + for(String entry: originalSet) { + if(entry.startsWith(DOI_PROTOCOL)) { + cleanSet.add(DOI_PROTOCOL + entry.substring(DOI_PROTOCOL.length()).toUpperCase()); + } else { + logger.warning("Non-DOI found in " + setName + " set of pidProvider id: " + getId() + ": " + entry + ". Entry is being dropped."); + } + } + return cleanSet; + } + + //For Unmanaged provider public AbstractDOIProvider(String name, String label) { super(name, label, DOI_PROTOCOL); } @@ -67,7 +83,7 @@ public GlobalId parsePersistentId(String protocol, String authority, String iden if (!DOI_PROTOCOL.equals(protocol)) { return null; } - return super.parsePersistentId(protocol, authority, identifier); + return super.parsePersistentId(protocol, authority, identifier, true); } public String getUrlPrefix() { diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java index 5630844fb32..b07cd027a01 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java @@ -226,8 +226,7 @@ protected String getProviderKeyName() { @Override public String getProviderType() { - // TODO Auto-generated method stub - return null; + return TYPE; } public String getMdsUrl() { diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java index a8cf9ed519b..d0dcf3461cf 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java @@ -420,7 +420,7 @@ synchronized private static Dataset getNextToIndex(Long id, Dataset d) { public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { try { acquirePermitFromSemaphore(); - doAyncIndexDataset(dataset, doNormalSolrDocCleanUp); + doAsyncIndexDataset(dataset, doNormalSolrDocCleanUp); } catch (InterruptedException e) { String failureLogText = "Indexing failed: interrupted. 
You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + dataset.getId().toString(); failureLogText += "\r\n" + e.getLocalizedMessage(); @@ -430,7 +430,7 @@ public void asyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { } } - private void doAyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { + private void doAsyncIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) { Long id = dataset.getId(); Dataset next = getNextToIndex(id, dataset); // if there is an ongoing index job for this dataset, next is null (ongoing index job will reindex the newest version after current indexing finishes) while (next != null) { @@ -451,7 +451,7 @@ public void asyncIndexDatasetList(List datasets, boolean doNormalSolrDo for(Dataset dataset : datasets) { try { acquirePermitFromSemaphore(); - doAyncIndexDataset(dataset, true); + doAsyncIndexDataset(dataset, true); } catch (InterruptedException e) { String failureLogText = "Indexing failed: interrupted. You can kickoff a re-index of this dataset with: \r\n curl http://localhost:8080/api/admin/index/datasets/" + dataset.getId().toString(); failureLogText += "\r\n" + e.getLocalizedMessage(); @@ -2407,6 +2407,11 @@ public void deleteHarvestedDocuments(Dataset harvestedDataset) { solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + datafile.getId()); } + deleteHarvestedDocuments(solrIdsOfDocumentsToDelete); + } + + public void deleteHarvestedDocuments(List solrIdsOfDocumentsToDelete) { + logger.fine("attempting to delete the following documents from the index: " + StringUtils.join(solrIdsOfDocumentsToDelete, ",")); IndexResponse resultOfAttemptToDeleteDocuments = solrIndexService.deleteMultipleSolrIds(solrIdsOfDocumentsToDelete); logger.fine("result of attempt to delete harvested documents: " + resultOfAttemptToDeleteDocuments + "\n"); diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchServiceBean.java index ee93c49ad34..de75c88009f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchServiceBean.java @@ -35,6 +35,8 @@ import jakarta.inject.Inject; import jakarta.inject.Named; import jakarta.persistence.NoResultException; + +import org.apache.commons.lang3.StringUtils; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.client.solrj.SolrServerException; @@ -52,6 +54,8 @@ public class SearchServiceBean { private static final Logger logger = Logger.getLogger(SearchServiceBean.class.getCanonicalName()); + private static final String ALL_GROUPS = "*"; + /** * We're trying to make the SearchServiceBean lean, mean, and fast, with as * few injections of EJBs as possible. @@ -182,6 +186,7 @@ public SolrQueryResponse search( SolrQuery solrQuery = new SolrQuery(); query = SearchUtil.sanitizeQuery(query); + solrQuery.setQuery(query); if (sortField != null) { // is it ok not to specify any sort? - there are cases where we @@ -323,24 +328,13 @@ public SolrQueryResponse search( } } - //I'm not sure if just adding null here is good for hte permissions system... 
i think it needs something - if(dataverses != null) { - for(Dataverse dataverse : dataverses) { - // ----------------------------------- - // PERMISSION FILTER QUERY - // ----------------------------------- - String permissionFilterQuery = this.getPermissionFilterQuery(dataverseRequest, solrQuery, dataverse, onlyDatatRelatedToMe, addFacets); - if (permissionFilterQuery != null) { - solrQuery.addFilterQuery(permissionFilterQuery); - } - } - } else { - String permissionFilterQuery = this.getPermissionFilterQuery(dataverseRequest, solrQuery, null, onlyDatatRelatedToMe, addFacets); - if (permissionFilterQuery != null) { - solrQuery.addFilterQuery(permissionFilterQuery); - } + // ----------------------------------- + // PERMISSION FILTER QUERY + // ----------------------------------- + String permissionFilterQuery = this.getPermissionFilterQuery(dataverseRequest, solrQuery, onlyDatatRelatedToMe, addFacets); + if (!StringUtils.isBlank(permissionFilterQuery)) { + solrQuery.addFilterQuery(permissionFilterQuery); } - /** * @todo: do sanity checking... throw error if negative @@ -994,7 +988,7 @@ public String getCapitalizedName(String name) { * * @return */ - private String getPermissionFilterQuery(DataverseRequest dataverseRequest, SolrQuery solrQuery, Dataverse dataverse, boolean onlyDatatRelatedToMe, boolean addFacets) { + private String getPermissionFilterQuery(DataverseRequest dataverseRequest, SolrQuery solrQuery, boolean onlyDatatRelatedToMe, boolean addFacets) { User user = dataverseRequest.getUser(); if (user == null) { @@ -1003,38 +997,22 @@ private String getPermissionFilterQuery(DataverseRequest dataverseRequest, SolrQ if (solrQuery == null) { throw new NullPointerException("solrQuery cannot be null"); } - /** - * @todo For people who are not logged in, should we show stuff indexed - * with "AllUsers" group or not? If so, uncomment the allUsersString - * stuff below. - */ -// String allUsersString = IndexServiceBean.getGroupPrefix() + AllUsers.get().getAlias(); -// String publicOnly = "{!join from=" + SearchFields.DEFINITION_POINT + " to=id}" + SearchFields.DISCOVERABLE_BY + ":(" + IndexServiceBean.getPublicGroupString() + " OR " + allUsersString + ")"; - String publicOnly = "{!join from=" + SearchFields.DEFINITION_POINT + " to=id}" + SearchFields.DISCOVERABLE_BY + ":(" + IndexServiceBean.getPublicGroupString() + ")"; -// String publicOnly = "{!join from=" + SearchFields.GROUPS + " to=" + SearchFields.PERMS + "}id:" + IndexServiceBean.getPublicGroupString(); - // initialize to public only to be safe - String dangerZoneNoSolrJoin = null; - + if (user instanceof PrivateUrlUser) { user = GuestUser.get(); } - AuthenticatedUser au = null; + ArrayList groupList = new ArrayList(); + AuthenticatedUser au = null; Set groups; - - if (user instanceof GuestUser) { - // Yes, GuestUser may be part of one or more groups; such as IP Groups. - groups = groupService.collectAncestors(groupService.groupsFor(dataverseRequest)); - } else { - if (!(user instanceof AuthenticatedUser)) { - logger.severe("Should never reach here. A User must be an AuthenticatedUser or a Guest"); - throw new IllegalStateException("A User must be an AuthenticatedUser or a Guest"); - } + boolean avoidJoin = FeatureFlags.AVOID_EXPENSIVE_SOLR_JOIN.enabled(); + + if (user instanceof AuthenticatedUser) { au = (AuthenticatedUser) user; - + // ---------------------------------------------------- - // (3) Is this a Super User? + // Is this a Super User? 
// If so, they can see everything // ---------------------------------------------------- if (au.isSuperuser()) { @@ -1042,187 +1020,76 @@ private String getPermissionFilterQuery(DataverseRequest dataverseRequest, SolrQ // to see everything in Solr with no regard to permissions. But it's // been this way since Dataverse 4.0. So relax. :) - return dangerZoneNoSolrJoin; + return buildPermissionFilterQuery(avoidJoin, ALL_GROUPS); } - + // ---------------------------------------------------- - // (4) User is logged in AND onlyDatatRelatedToMe == true + // User is logged in AND onlyDatatRelatedToMe == true // Yes, give back everything -> the settings will be in - // the filterqueries given to search + // the filterqueries given to search // ---------------------------------------------------- if (onlyDatatRelatedToMe == true) { if (systemConfig.myDataDoesNotUsePermissionDocs()) { logger.fine("old 4.2 behavior: MyData is not using Solr permission docs"); - return dangerZoneNoSolrJoin; + return buildPermissionFilterQuery(avoidJoin, ALL_GROUPS); } else { // fall-through logger.fine("new post-4.2 behavior: MyData is using Solr permission docs"); } } - // ---------------------------------------------------- - // (5) Work with Authenticated User who is not a Superuser + // Work with Authenticated User who is not a Superuser // ---------------------------------------------------- - - groups = groupService.collectAncestors(groupService.groupsFor(dataverseRequest)); + groupList.add(IndexServiceBean.getGroupPerUserPrefix() + au.getId()); } - if (FeatureFlags.AVOID_EXPENSIVE_SOLR_JOIN.enabled()) { - /** - * Instead of doing a super expensive join, we will rely on the - * new boolean field PublicObject:true for public objects. This field - * is indexed on the content document itself, rather than a permission - * document. An additional join will be added only for any extra, - * more restricted groups that the user may be part of. - * **Note the experimental nature of this optimization**. - */ - StringBuilder sb = new StringBuilder(); - StringBuilder sbgroups = new StringBuilder(); - - // All users, guests and authenticated, should see all the - // documents marked as publicObject_b:true, at least: - sb.append(SearchFields.PUBLIC_OBJECT + ":" + true); + // In addition to the user referenced directly, we will also + // add joins on all the non-public groups that may exist for the + // user: - // One or more groups *may* also be available for this user. Once again, - // do note that Guest users may be part of some groups, such as - // IP groups. - - int groupCounter = 0; + // Authenticated users, *and the GuestUser*, may be part of one or more groups; such + // as IP Groups. 
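+ // (collectAncestors expands the user's direct groups to include any groups
+ // that contain them, so permissions granted to a parent group are honored
+ // in the filter query built below.)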
+ groups = groupService.collectAncestors(groupService.groupsFor(dataverseRequest)); - // An AuthenticatedUser should also be able to see all the content - // on which they have direct permissions: - if (au != null) { - groupCounter++; - sbgroups.append(IndexServiceBean.getGroupPerUserPrefix() + au.getId()); - } - - // In addition to the user referenced directly, we will also - // add joins on all the non-public groups that may exist for the - // user: - for (Group group : groups) { - String groupAlias = group.getAlias(); - if (groupAlias != null && !groupAlias.isEmpty() && !groupAlias.startsWith("builtIn")) { - groupCounter++; - if (groupCounter > 1) { - sbgroups.append(" OR "); - } - sbgroups.append(IndexServiceBean.getGroupPrefix() + groupAlias); - } - } - - if (groupCounter > 1) { - // If there is more than one group, the parentheses must be added: - sbgroups.insert(0, "("); - sbgroups.append(")"); - } - - if (groupCounter > 0) { - // If there are any groups for this user, an extra join must be - // added to the query, and the extra sub-query must be added to - // the combined Solr query: - sb.append(" OR {!join from=" + SearchFields.DEFINITION_POINT + " to=id v=$q1}"); - // Add the subquery to the combined Solr query: - solrQuery.setParam("q1", SearchFields.DISCOVERABLE_BY + ":" + sbgroups.toString()); - logger.info("The sub-query q1 set to " + SearchFields.DISCOVERABLE_BY + ":" + sbgroups.toString()); - } - - String ret = sb.toString(); - logger.fine("Returning experimental query: " + ret); - return ret; - } - - // END OF EXPERIMENTAL OPTIMIZATION - - // Old, un-optimized way of handling permissions. - // Largely left intact, minus the lookups that have already been performed - // above. - - // ---------------------------------------------------- - // (1) Is this a GuestUser? - // ---------------------------------------------------- - if (user instanceof GuestUser) { - - StringBuilder sb = new StringBuilder(); - - String groupsFromProviders = ""; - for (Group group : groups) { - logger.fine("found group " + group.getIdentifier() + " with alias " + group.getAlias()); - String groupAlias = group.getAlias(); - if (groupAlias != null && !groupAlias.isEmpty()) { - sb.append(" OR "); - // i.e. group_builtIn/all-users, ip/ipGroup3 - sb.append(IndexServiceBean.getGroupPrefix()).append(groupAlias); - } - } - groupsFromProviders = sb.toString(); - logger.fine("groupsFromProviders:" + groupsFromProviders); - String guestWithGroups = "{!join from=" + SearchFields.DEFINITION_POINT + " to=id}" + SearchFields.DISCOVERABLE_BY + ":(" + IndexServiceBean.getPublicGroupString() + groupsFromProviders + ")"; - logger.fine(guestWithGroups); - return guestWithGroups; - } - - // ---------------------------------------------------- - // (5) Work with Authenticated User who is not a Superuser - // ---------------------------------------------------- - // It was already confirmed, that if the user is not GuestUser, we - // have an AuthenticatedUser au which is not null. - /** - * @todo all this code needs cleanup and clarification. - */ - /** - * Every AuthenticatedUser is part of a "User Private Group" (UGP), a - * concept we borrow from RHEL: - * https://access.redhat.com/site/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Deployment_Guide/ch-Managing_Users_and_Groups.html#s2-users-groups-private-groups - */ - /** - * @todo rename this from publicPlusUserPrivateGroup. Confusing - */ - // safe default: public only - String publicPlusUserPrivateGroup = publicOnly; -// + (onlyDatatRelatedToMe ? 
"" : (publicOnly + " OR ")) -// + "{!join from=" + SearchFields.GROUPS + " to=" + SearchFields.PERMS + "}id:" + IndexServiceBean.getGroupPerUserPrefix() + au.getId() + ")"; - -// /** -// * @todo add onlyDatatRelatedToMe option into the experimental JOIN -// * before enabling it. -// */ - /** - * From a search perspective, we don't care about if the group was - * created within one dataverse or another. We just want a list of *all* - * the groups the user is part of. We are greedy. We want all BuiltIn - * Groups, Shibboleth Groups, IP Groups, "system" groups, everything. - * - * A JOIN on "permission documents" will determine if the user can find - * a given "content document" (dataset version, etc) in Solr. - */ - String groupsFromProviders = ""; - StringBuilder sb = new StringBuilder(); for (Group group : groups) { - logger.fine("found group " + group.getIdentifier() + " with alias " + group.getAlias()); String groupAlias = group.getAlias(); - if (groupAlias != null && !groupAlias.isEmpty()) { - sb.append(" OR "); - // i.e. group_builtIn/all-users, group_builtIn/authenticated-users, group_1-explictGroup1, group_shib/2 - sb.append(IndexServiceBean.getGroupPrefix() + groupAlias); + if (groupAlias != null && !groupAlias.isEmpty() && (!avoidJoin || !groupAlias.startsWith("builtIn"))) { + groupList.add(IndexServiceBean.getGroupPrefix() + groupAlias); } } - groupsFromProviders = sb.toString(); - logger.fine(groupsFromProviders); - if (true) { - /** - * @todo get rid of "experimental" in name - */ - String experimentalJoin = "{!join from=" + SearchFields.DEFINITION_POINT + " to=id}" + SearchFields.DISCOVERABLE_BY + ":(" + IndexServiceBean.getPublicGroupString() + " OR " + IndexServiceBean.getGroupPerUserPrefix() + au.getId() + groupsFromProviders + ")"; - publicPlusUserPrivateGroup = experimentalJoin; + if (!avoidJoin) { + // Add the public group + groupList.add(0, IndexServiceBean.getPublicGroupString()); + } + + String groupString = null; + //If we have additional groups, format them correctly into a search string, with parens if there is more than one + if (groupList.size() > 1) { + groupString = "(" + StringUtils.join(groupList, " OR ") + ")"; + } else if (groupList.size() == 1) { + groupString = groupList.get(0); } - - //permissionFilterQuery = publicPlusUserPrivateGroup; - logger.fine(publicPlusUserPrivateGroup); - - return publicPlusUserPrivateGroup; - + logger.fine("Groups: " + groupString); + String permissionQuery = buildPermissionFilterQuery(avoidJoin, groupString); + logger.fine("Permission Query: " + permissionQuery); + return permissionQuery; } + private String buildPermissionFilterQuery(boolean avoidJoin, String permissionFilterGroups) { + String query = (avoidJoin&& !isAllGroups(permissionFilterGroups)) ? 
SearchFields.PUBLIC_OBJECT + ":" + true : ""; + if (permissionFilterGroups != null && !isAllGroups(permissionFilterGroups)) { + if (!query.isEmpty()) { + query = "(" + query + " OR " + "{!join from=" + SearchFields.DEFINITION_POINT + " to=id}" + SearchFields.DISCOVERABLE_BY + ":" + permissionFilterGroups + ")"; + } else { + query = "{!join from=" + SearchFields.DEFINITION_POINT + " to=id}" + SearchFields.DISCOVERABLE_BY + ":" + permissionFilterGroups; + } + } + return query; + } + + private boolean isAllGroups(String groups) { + return (groups!=null &&groups.equals(ALL_GROUPS)); + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index a0c32d5c8ce..991682ec8e8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -525,15 +525,18 @@ public static String determineFileType(File f, String fileName) throws IOExcepti // Check for shapefile extensions as described here: http://en.wikipedia.org/wiki/Shapefile //logger.info("Checking for shapefile"); - ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(f)); + ShapefileHandler shp_handler = new ShapefileHandler(f); if (shp_handler.containsShapefile()){ // logger.info("------- shapefile FOUND ----------"); fileType = ShapefileHandler.SHAPEFILE_FILE_TYPE; //"application/zipped-shapefile"; } - - Optional bagItFileHandler = CDI.current().select(BagItFileHandlerFactory.class).get().getBagItFileHandler(); - if(bagItFileHandler.isPresent() && bagItFileHandler.get().isBagItPackage(fileName, f)) { - fileType = BagItFileHandler.FILE_TYPE; + try { + Optional bagItFileHandler = CDI.current().select(BagItFileHandlerFactory.class).get().getBagItFileHandler(); + if (bagItFileHandler.isPresent() && bagItFileHandler.get().isBagItPackage(fileName, f)) { + fileType = BagItFileHandler.FILE_TYPE; + } + } catch (Exception e) { + logger.warning("Error checking for BagIt package: " + e.getMessage()); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java b/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java index bb916cc3906..2b54f7a3bfe 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java @@ -1,23 +1,21 @@ package edu.harvard.iq.dataverse.util; import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.Date; import java.util.ArrayList; import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; -import java.util.zip.ZipException; +import java.util.zip.ZipFile; import java.util.HashMap; import java.util.*; import java.nio.file.Files; import java.nio.file.Paths; import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; -import java.util.logging.Level; + import java.util.logging.Logger; import org.apache.commons.io.FileUtils; @@ -43,11 +41,10 @@ * "shape1.pdf", "README.md", "shape_notes.txt" * * Code Example: - * FileInputStream shp_file_input_stream = new FileInputStream(new File("zipped_shapefile.zip")) - * ShapefileHandler shp_handler = new ShapefileHandler(shp_file_input_stream); + * ShapefileHandler shp_handler = new ShapefileHandler(new File("zipped_shapefile.zip")); * if (shp_handler.containsShapefile()){ * File rezip_folder = 
new File("~/folder_for_rezipping"); - * boolean rezip_success = shp_handler.rezipShapefileSets(shp_file_input_stream, rezip_folder ); + * boolean rezip_success = shp_handler.rezipShapefileSets(rezip_folder ); * if (!rezip_success){ * // rezip failed, should be an error message (String) available System.out.println(shp_handler.error_message); @@ -74,7 +71,7 @@ public class ShapefileHandler{ public static final String SHP_XML_EXTENSION = "shp.xml"; public static final String BLANK_EXTENSION = "__PLACEHOLDER-FOR-BLANK-EXTENSION__"; public static final List SHAPEFILE_ALL_EXTENSIONS = Arrays.asList("shp", "shx", "dbf", "prj", "sbn", "sbx", "fbn", "fbx", "ain", "aih", "ixs", "mxs", "atx", "cpg", "qpj", "qmd", SHP_XML_EXTENSION); - + private final File zipFile; public boolean DEBUG = false; private boolean zipFileProcessed = false; @@ -97,9 +94,6 @@ public class ShapefileHandler{ private Map> fileGroups = new HashMap<>(); private List finalRezippedFiles = new ArrayList<>(); - - private String outputFolder = "unzipped"; - private String rezippedFolder = "rezipped"; // Debug helper private void msg(String s){ @@ -116,40 +110,28 @@ private void msgt(String s){ } /* - Constructor, start with filename - */ - public ShapefileHandler(String filename){ - - if (filename==null){ - this.addErrorMessage("The filename was null"); - return; - } - - FileInputStream zip_file_stream; - try { - zip_file_stream = new FileInputStream(new File(filename)); - } catch (FileNotFoundException ex) { - this.addErrorMessage("The file was not found"); + Constructor, start with File + */ + public ShapefileHandler(File zip_file) throws IOException { + zipFile = zip_file; + if (zip_file == null) { + this.addErrorMessage("The file was null"); return; } - - this.examineZipfile(zip_file_stream); - } - - - /* - Constructor, start with FileInputStream - */ - public ShapefileHandler(FileInputStream zip_file_stream){ - - if (zip_file_stream==null){ - this.addErrorMessage("The zip_file_stream was null"); - return; + try (var zip_file_object = new ZipFile(zip_file)) { + this.examineZipfile(zip_file_object); + } + catch (FileNotFoundException ex) { + // While this constructor had a FileInputStream as argument: + // FileUtil.determineFileType threw this exception before calling the constructor with a FileInputStream + // IngestServiceShapefileHelper.processFile won´t call this constructor if the file is not valid hence does not exist. + // When the file would have disappeared in the meantime, it would have produced a slightly different error message. + logger.severe("File not found: " + zip_file.getAbsolutePath()); + throw ex; } - this.examineZipfile(zip_file_stream); } - + public List getFinalRezippedFiles(){ return this.finalRezippedFiles; } @@ -291,26 +273,19 @@ inside the uploaded zip file (issue #6873). To achieve this, we recreate subfolders in the FileMetadata of the newly created DataFiles. (-- L.A. 09/2020) */ - private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File target_directory){ + private boolean unzipFilesToDirectory(ZipFile zipfileInput, File target_directory){ logger.fine("unzipFilesToDirectory: " + target_directory.getAbsolutePath() ); - if (zipfile_input_stream== null){ - this.addErrorMessage("unzipFilesToDirectory. 
The zipfile_input_stream is null."); - return false; - } if (!target_directory.isDirectory()){ this.addErrorMessage("This directory does not exist: " + target_directory.getAbsolutePath()); return false; } - List unzippedFileNames = new ArrayList<>(); - - ZipInputStream zipStream = new ZipInputStream(zipfile_input_stream); + List unzippedFileNames = new ArrayList<>(); + - ZipEntry origEntry; - byte[] buffer = new byte[2048]; try { - while((origEntry = zipStream.getNextEntry())!=null){ + for(var origEntry : Collections.list(zipfileInput.entries())){ String zentryFileName = origEntry.getName(); logger.fine("\nOriginal entry name: " + origEntry); @@ -360,15 +335,10 @@ private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File unzippedFileNames.add(outpath); } logger.fine("Write zip file: " + outpath); - FileOutputStream fileOutputStream; - long fsize = 0; - fileOutputStream = new FileOutputStream(outpath); - int len;// = 0; - while ((len = zipStream.read(buffer)) > 0){ - fileOutputStream.write(buffer, 0, len); - fsize+=len; - } // end while - fileOutputStream.close(); + try(var inputStream = zipfileInput.getInputStream(origEntry)) { + Files.createDirectories(new File(outpath).getParentFile().toPath()); + Files.copy(inputStream, Path.of(outpath), StandardCopyOption.REPLACE_EXISTING); + } } // end outer while } catch (IOException ex) { for (StackTraceElement el : ex.getStackTrace()){ @@ -377,19 +347,13 @@ private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File this.addErrorMessage("Failed to open ZipInputStream entry" + ex.getMessage()); return false; } - - try { - zipStream.close(); - } catch (IOException ex) { - Logger.getLogger(ShapefileHandler.class.getName()).log(Level.SEVERE, null, ex); - } - return true; + return true; } /* Rezip the shapefile(s) into a given directory Assumes that the zipfile_input_stream has already been checked! */ - public boolean rezipShapefileSets(FileInputStream zipfile_input_stream, File rezippedFolder) throws IOException{ + public boolean rezipShapefileSets(File rezippedFolder) throws IOException{ logger.fine("rezipShapefileSets"); //msgt("rezipShapefileSets"); if (!this.zipFileProcessed){ @@ -400,10 +364,6 @@ public boolean rezipShapefileSets(FileInputStream zipfile_input_stream, File rez this.addErrorMessage("There are no shapefiles here!"); return false; } - if (zipfile_input_stream== null){ - this.addErrorMessage("The zipfile_input_stream is null."); - return false; - } if (rezippedFolder == null){ this.addErrorMessage("The rezippedFolder is null."); return false; @@ -433,9 +393,11 @@ public boolean rezipShapefileSets(FileInputStream zipfile_input_stream, File rez // Unzip files! - if (!this.unzipFilesToDirectory(zipfile_input_stream, dir_for_unzipping)){ - this.addErrorMessage("Failed to unzip files."); - return false; + try(var zipfileObject = new ZipFile(zipFile)) { + if (!this.unzipFilesToDirectory(zipfileObject, dir_for_unzipping)) { + this.addErrorMessage("Failed to unzip files."); + return false; + } } // Redistribute files! String target_dirname = rezippedFolder.getAbsolutePath(); @@ -681,27 +643,19 @@ private boolean isFileToSkip(String fname){ /************************************** * Iterate through the zip file contents. * Does it contain any shapefiles? 
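+ * (Side effects: clears, then repopulates filesListInDir, filesizeHash and
+ * fileGroups; sets zipFileProcessed on success.)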
- * - * @param FileInputStream zip_file_stream */ - private boolean examineZipfile(FileInputStream zip_file_stream){ + private boolean examineZipfile(ZipFile zip_file){ // msgt("examineZipfile"); - - if (zip_file_stream==null){ - this.addErrorMessage("The zip file stream was null"); - return false; - } - + // Clear out file lists this.filesListInDir.clear(); this.filesizeHash.clear(); this.fileGroups.clear(); - try{ - ZipInputStream zipStream = new ZipInputStream(zip_file_stream); - ZipEntry entry; - List hiddenDirectories = new ArrayList<>(); - while((entry = zipStream.getNextEntry())!=null){ + try{ + List hiddenDirectories = new ArrayList<>(); + for(var entry : Collections.list(zip_file.entries())){ + String zentryFileName = entry.getName(); boolean isDirectory = entry.isDirectory(); @@ -748,8 +702,6 @@ private boolean examineZipfile(FileInputStream zip_file_stream){ this.filesizeHash.put(unzipFilePath, entry.getSize()); } } // end while - - zipStream.close(); if (this.filesListInDir.isEmpty()){ errorMessage = "No files in zipStream"; @@ -759,13 +711,8 @@ private boolean examineZipfile(FileInputStream zip_file_stream){ this.zipFileProcessed = true; return true; - }catch(ZipException ex){ - this.addErrorMessage("ZipException"); - msgt("ZipException"); - return false; - }catch(IOException ex){ - //ex.printStackTrace(); + //ex.printStackTrace(); this.addErrorMessage("IOException File name"); msgt("IOException"); return false; @@ -773,9 +720,6 @@ private boolean examineZipfile(FileInputStream zip_file_stream){ this.addErrorMessage("IllegalArgumentException when parsing zipfile"); msgt("IllegalArgumentException when parsing zipfile"); return false; - - }finally{ - } } // end examineFile diff --git a/src/test/java/edu/harvard/iq/dataverse/api/DataversesIT.java b/src/test/java/edu/harvard/iq/dataverse/api/DataversesIT.java index 8c6a8244af1..6a040f27786 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/DataversesIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/DataversesIT.java @@ -819,10 +819,9 @@ public void testImport() throws IOException, InterruptedException { Response deleteUserResponse = UtilIT.deleteUser(username); assertEquals(200, deleteUserResponse.getStatusCode()); } - - @Test - public void testAttributesApi() throws Exception { + @Test + public void testAttributesApi() { Response createUser = UtilIT.createRandomUser(); String apiToken = UtilIT.getApiTokenFromResponse(createUser); @@ -837,30 +836,70 @@ public void testAttributesApi() throws Exception { String collectionAlias = UtilIT.getAliasFromResponse(createDataverseResponse); String newCollectionAlias = collectionAlias + "RENAMED"; - - // Change the alias of the collection: - - Response changeAttributeResp = UtilIT.setCollectionAttribute(collectionAlias, "alias", newCollectionAlias, apiToken); - changeAttributeResp.prettyPrint(); - + + // Change the name of the collection: + + String newCollectionName = "Renamed Name"; + Response changeAttributeResp = UtilIT.setCollectionAttribute(collectionAlias, "name", newCollectionName, apiToken); changeAttributeResp.then().assertThat() .statusCode(OK.getStatusCode()) .body("message.message", equalTo("Update successful")); - - // Check on the collection, under the new alias: - + + // Change the description of the collection: + + String newDescription = "Renamed Description"; + changeAttributeResp = UtilIT.setCollectionAttribute(collectionAlias, "description", newDescription, apiToken); + changeAttributeResp.then().assertThat() + .statusCode(OK.getStatusCode()) + 
.body("message.message", equalTo("Update successful")); + + // Change the affiliation of the collection: + + String newAffiliation = "Renamed Affiliation"; + changeAttributeResp = UtilIT.setCollectionAttribute(collectionAlias, "affiliation", newAffiliation, apiToken); + changeAttributeResp.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("message.message", equalTo("Update successful")); + + // Cannot update filePIDsEnabled from a regular user: + + changeAttributeResp = UtilIT.setCollectionAttribute(collectionAlias, "filePIDsEnabled", "true", apiToken); + changeAttributeResp.then().assertThat() + .statusCode(UNAUTHORIZED.getStatusCode()); + + // Change the alias of the collection: + + changeAttributeResp = UtilIT.setCollectionAttribute(collectionAlias, "alias", newCollectionAlias, apiToken); + changeAttributeResp.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("message.message", equalTo("Update successful")); + + // Check on the collection, under the new alias: + Response collectionInfoResponse = UtilIT.exportDataverse(newCollectionAlias, apiToken); - collectionInfoResponse.prettyPrint(); - collectionInfoResponse.then().assertThat() .statusCode(OK.getStatusCode()) - .body("data.alias", equalTo(newCollectionAlias)); - + .body("data.alias", equalTo(newCollectionAlias)) + .body("data.name", equalTo(newCollectionName)) + .body("data.description", equalTo(newDescription)) + .body("data.affiliation", equalTo(newAffiliation)); + // Delete the collection (again, using its new alias): - + Response deleteCollectionResponse = UtilIT.deleteDataverse(newCollectionAlias, apiToken); - deleteCollectionResponse.prettyPrint(); assertEquals(OK.getStatusCode(), deleteCollectionResponse.getStatusCode()); + + // Cannot update root collection from a regular user: + + changeAttributeResp = UtilIT.setCollectionAttribute("root", "name", newCollectionName, apiToken); + changeAttributeResp.then().assertThat() + .statusCode(UNAUTHORIZED.getStatusCode()); + + collectionInfoResponse = UtilIT.exportDataverse("root", apiToken); + + collectionInfoResponse.then().assertThat() + .statusCode(OK.getStatusCode()) + .body("data.name", equalTo("Root")); } @Test diff --git a/src/test/java/edu/harvard/iq/dataverse/api/SearchIT.java b/src/test/java/edu/harvard/iq/dataverse/api/SearchIT.java index 3a2b684c421..b03c23cd1e2 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/SearchIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/SearchIT.java @@ -1308,8 +1308,8 @@ public void testSearchFilesAndUrlImages() { .statusCode(200); pathToFile = "src/main/webapp/resources/js/mydata.js"; Response uploadFile = UtilIT.uploadFileViaNative(datasetId.toString(), pathToFile, apiToken); - uploadImage.prettyPrint(); - uploadImage.then().assertThat() + uploadFile.prettyPrint(); + uploadFile.then().assertThat() .statusCode(200); Response publishDataverse = UtilIT.publishDataverseViaSword(dataverseAlias, apiToken); @@ -1337,7 +1337,7 @@ public void testSearchFilesAndUrlImages() { .statusCode(OK.getStatusCode()) .body("data.items[0].type", CoreMatchers.is("dataverse")) .body("data.items[0].url", CoreMatchers.containsString("/dataverse/")) - .body("data.items[0]", CoreMatchers.not(CoreMatchers.hasItem("url_image"))); + .body("data.items[0]", CoreMatchers.not(CoreMatchers.hasItem("image_url"))); searchResp = UtilIT.search("mydata", apiToken); searchResp.prettyPrint(); @@ -1345,6 +1345,6 @@ public void testSearchFilesAndUrlImages() { .statusCode(OK.getStatusCode()) .body("data.items[0].type", CoreMatchers.is("file")) 
.body("data.items[0].url", CoreMatchers.containsString("/datafile/")) - .body("data.items[0]", CoreMatchers.not(CoreMatchers.hasItem("url_image"))); + .body("data.items[0]", CoreMatchers.not(CoreMatchers.hasItem("image_url"))); } } diff --git a/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java new file mode 100644 index 00000000000..f49ebcea39c --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java @@ -0,0 +1,264 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.engine.command.CommandContext; +import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.engine.command.exception.CommandException; +import edu.harvard.iq.dataverse.settings.JvmSettings; +import edu.harvard.iq.dataverse.storageuse.UploadSessionQuotaLimit; +import edu.harvard.iq.dataverse.util.JhoveFileType; +import edu.harvard.iq.dataverse.util.SystemConfig; +import edu.harvard.iq.dataverse.util.testing.JvmSetting; +import edu.harvard.iq.dataverse.util.testing.LocalJvmSettings; +import org.jetbrains.annotations.NotNull; +import org.joda.time.DateTime; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.mockito.Mockito; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.nio.file.Files; +import java.nio.file.NoSuchFileException; +import java.nio.file.Path; +import java.security.SecureRandom; +import java.text.MessageFormat; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; + +import static edu.harvard.iq.dataverse.DataFile.ChecksumType.MD5; +import static org.apache.commons.io.file.FilesUncheck.createDirectories; +import static org.apache.commons.io.file.PathUtils.deleteDirectory; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; + + +@LocalJvmSettings +public class CreateNewDataFilesTest { + // TODO keep constants for annotations in sync with class name + Path testDir = Path.of("target/test/").resolve(getClass().getSimpleName()); + PrintStream original_stderr; + + @BeforeEach + public void cleanTmpDir() throws IOException { + original_stderr = System.err; + if(testDir.toFile().exists()) + deleteDirectory(testDir); + } + + @AfterEach void restoreStderr() { + System.setErr(original_stderr); + } + + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "target/test/CreateNewDataFilesTest/tmp") + public void execute_fails_to_upload_when_tmp_does_not_exist() throws FileNotFoundException { + + mockTmpLookup(); + var cmd = createCmd("scripts/search/data/shape/shapefile.zip", mockDatasetVersion(), 1000L, 500L); + var ctxt = mockCommandContext(mockSysConfig(true, 0L, MD5, 10)); + + assertThatThrownBy(() -> cmd.execute(ctxt)) + .isInstanceOf(CommandException.class) + .hasMessageContaining("Failed to save the upload as a temp file (temp disk space?)") + .hasRootCauseInstanceOf(NoSuchFileException.class) + .getRootCause() + 
.hasMessageStartingWith("target/test/CreateNewDataFilesTest/tmp/temp/tmp"); + } + + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "target/test/CreateNewDataFilesTest/tmp") + public void execute_fails_on_size_limit() throws Exception { + createDirectories(Path.of("target/test/CreateNewDataFilesTest/tmp/temp")); + + mockTmpLookup(); + var cmd = createCmd("scripts/search/data/binary/3files.zip", mockDatasetVersion(), 1000L, 500L); + var ctxt = mockCommandContext(mockSysConfig(true, 50L, MD5, 0)); + try (var mockedStatic = Mockito.mockStatic(JhoveFileType.class)) { + mockedStatic.when(JhoveFileType::getJhoveConfigFile).thenReturn("conf/jhove/jhove.conf"); + + assertThatThrownBy(() -> cmd.execute(ctxt)) + .isInstanceOf(CommandException.class) + .hasMessage("This file size (462 B) exceeds the size limit of 50 B."); + } + } + + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "target/test/CreateNewDataFilesTest/tmp") + public void execute_loads_individual_files_from_uploaded_zip() throws Exception { + var tempDir = testDir.resolve("tmp/temp"); + createDirectories(tempDir); + + mockTmpLookup(); + var cmd = createCmd("src/test/resources/own-cloud-downloads/greetings.zip", mockDatasetVersion(), 1000L, 500L); + var ctxt = mockCommandContext(mockSysConfig(false, 1000000L, MD5, 10)); + try (MockedStatic mockedStatic = Mockito.mockStatic(JhoveFileType.class)) { + mockedStatic.when(JhoveFileType::getJhoveConfigFile).thenReturn("conf/jhove/jhove.conf"); + + // the test + var result = cmd.execute(ctxt); + + assertThat(result.getErrors()).hasSize(0); + assertThat(result.getDataFiles().stream().map(dataFile -> + dataFile.getFileMetadata().getDirectoryLabel() + "/" + dataFile.getDisplayName() + )).containsExactlyInAnyOrder( + "DD-1576/goodbye.txt", "DD-1576/hello.txt" + ); + var storageIds = result.getDataFiles().stream().map(DataFile::getStorageIdentifier).toList(); + assertThat(tempDir.toFile().list()) + .containsExactlyInAnyOrderElementsOf(storageIds); + } + } + + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "target/test/CreateNewDataFilesTest/tmp") + public void execute_rezips_sets_of_shape_files_from_uploaded_zip() throws Exception { + var tempDir = testDir.resolve("tmp/temp"); + createDirectories(tempDir); + + mockTmpLookup(); + var cmd = createCmd("src/test/resources/own-cloud-downloads/shapes.zip", mockDatasetVersion(), 1000L, 500L); + var ctxt = mockCommandContext(mockSysConfig(false, 100000000L, MD5, 10)); + try (var mockedJHoveFileType = Mockito.mockStatic(JhoveFileType.class)) { + mockedJHoveFileType.when(JhoveFileType::getJhoveConfigFile).thenReturn("conf/jhove/jhove.conf"); + + // the test + var result = cmd.execute(ctxt); + + assertThat(result.getErrors()).hasSize(0); + assertThat(result.getDataFiles().stream().map(dataFile -> + (dataFile.getFileMetadata().getDirectoryLabel() + "/" + dataFile.getDisplayName()) + .replaceAll(".*/dataDir/", "") + )).containsExactlyInAnyOrder( + "shape1.zip", + "shape2/shape2", + "shape2/shape2.pdf", + "shape2/shape2.txt", + "shape2/shape2.zip", + "extra/shp_dictionary.xls", + "extra/notes", + "extra/README.MD" + ); + var storageIds = result.getDataFiles().stream().map(DataFile::getStorageIdentifier).toList(); + assertThat(tempDir.toFile().list()) + .containsExactlyInAnyOrderElementsOf(storageIds); + } + } + + @Disabled("Too slow. 
Intended for manual execution.") + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "/tmp/test/CreateNewDataFilesTest/tmp") + public void extract_zip_performance() throws Exception { + /* + Developed to test the performance difference between the old ZipInputStream implementation and the new ZipFile implementation. + Play with numbers depending on: + - the time you want to spend on this test + - how much system stress you want to examine + */ + var nrOfZipFiles = 20; + var avgNrOfFilesPerZip = 300; + var avgFileLength = 5000; + + var tmpUploadStorage = Path.of("/tmp/test/CreateNewDataFilesTest/tmp/temp"); + if(tmpUploadStorage.toFile().exists()) { + deleteDirectory(tmpUploadStorage); + } + createDirectories(tmpUploadStorage); // a temp dir under target would choke IntelliJ + + var chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + var random = new SecureRandom(); + var totalNrOfFiles = 0; + var totalFileSize = 0; + var totalTime = 0L; + var tmp = Path.of(Files.createTempDirectory(null).toString()); + var ctxt = mockCommandContext(mockSysConfig(false, 100000000L, MD5, 10000)); + try (var mockedJHoveFileType = Mockito.mockStatic(JhoveFileType.class)) { + mockedJHoveFileType.when(JhoveFileType::getJhoveConfigFile).thenReturn("conf/jhove/jhove.conf"); + for (var zipNr = 1; zipNr <= nrOfZipFiles; zipNr++) { + // build the zip + var zip = tmp.resolve(zipNr + "-data.zip"); + var nrOfFilesInZip = random.nextInt(avgNrOfFilesPerZip * 2); + try (var zipStream = new ZipOutputStream(new FileOutputStream(zip.toFile()))) { + for (var fileInZipNr = 1; fileInZipNr <= nrOfFilesInZip; fileInZipNr++) { + // build content for a file + var stringLength = random.nextInt(avgFileLength * 2 - 5); + StringBuilder sb = new StringBuilder(stringLength); + for (int i = 1; i <= stringLength; i++) { // zero length causes buffer underflow + sb.append(chars.charAt(random.nextInt(chars.length()))); + } + // add the file to the zip + zipStream.putNextEntry(new ZipEntry(fileInZipNr + ".txt")); + zipStream.write((sb.toString()).getBytes()); + zipStream.closeEntry(); + totalFileSize += stringLength; + } + } + + // upload the zip + var before = DateTime.now(); + var result = createCmd(zip.toString(), mockDatasetVersion(), 1000L, 500L) + .execute(ctxt); + totalTime += DateTime.now().getMillis() - before.getMillis(); + + assertThat(result.getErrors()).hasSize(0); + assertThat(result.getDataFiles()).hasSize(nrOfFilesInZip); + totalNrOfFiles += nrOfFilesInZip; + + // report after each zip to have some data even when aborting a test that takes too long + System.out.println(MessageFormat.format( + "Total time: {0}ms; nr of zips: {1}; total nr of files: {2}; total file size: {3}", + totalTime, zipNr, totalNrOfFiles, totalFileSize + )); + } + assertThat(tmpUploadStorage.toFile().list()).hasSize(totalNrOfFiles); + } + } + + private static @NotNull CreateNewDataFilesCommand createCmd(String name, DatasetVersion dsVersion, long allocatedQuotaLimit, long usedQuotaLimit) throws FileNotFoundException { + return new CreateNewDataFilesCommand( + Mockito.mock(DataverseRequest.class), + dsVersion, + new FileInputStream(name), + "example.zip", + "application/zip", + null, + new UploadSessionQuotaLimit(allocatedQuotaLimit, usedQuotaLimit), + "sha"); + } + + private static @NotNull CommandContext mockCommandContext(SystemConfig sysCfg) { + var ctxt = Mockito.mock(CommandContext.class); + Mockito.when(ctxt.systemConfig()).thenReturn(sysCfg); + return ctxt; + } + + private static @NotNull SystemConfig mockSysConfig(boolean 
isStorageQuotaEnforced, long maxFileUploadSizeForStore, DataFile.ChecksumType checksumType, int zipUploadFilesLimit) { + var sysCfg = Mockito.mock(SystemConfig.class); + Mockito.when(sysCfg.isStorageQuotasEnforced()).thenReturn(isStorageQuotaEnforced); + Mockito.when(sysCfg.getMaxFileUploadSizeForStore(any())).thenReturn(maxFileUploadSizeForStore); + Mockito.when(sysCfg.getFileFixityChecksumAlgorithm()).thenReturn(checksumType); + Mockito.when(sysCfg.getZipUploadFilesLimit()).thenReturn(zipUploadFilesLimit); + return sysCfg; + } + + private static void mockTmpLookup() { + JvmSettings mockFilesDirectory = Mockito.mock(JvmSettings.class); + Mockito.when(mockFilesDirectory.lookup()).thenReturn("/mocked/path"); + } + + private static @NotNull DatasetVersion mockDatasetVersion() { + var dsVersion = Mockito.mock(DatasetVersion.class); + Mockito.when(dsVersion.getDataset()).thenReturn(Mockito.mock(Dataset.class)); + return dsVersion; + } + +} diff --git a/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java index 58d69da743b..ecf18e6b1ca 100644 --- a/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java @@ -250,9 +250,12 @@ public void testDOIParsing() throws IOException { assertEquals(pid1String, pid3.asString()); assertEquals("dc1", pid3.getProviderId()); - String pid4String = "doi:10.5072/FK3ABCDEF"; + // Also test case-insensitive parsing + String pid4String = "doi:10.5072/fk3ABCDEF"; GlobalId pid4 = PidUtil.parseAsGlobalID(pid4String); - assertEquals(pid4String, pid4.asString()); + // Lower case input is normalized to upper case internally, so compare against the upper case identifier: + // the lower case string must parse to the same internal upper case PID representation as the upper case string + assertEquals("doi:10.5072/FK3ABCDEF", pid4.asString()); assertEquals("dc2", pid4.getProviderId()); String pid5String = "doi:10.5072/FK2ABCDEF"; diff --git a/src/test/java/edu/harvard/iq/dataverse/util/shapefile/ShapefileHandlerTest.java b/src/test/java/edu/harvard/iq/dataverse/util/shapefile/ShapefileHandlerTest.java index 3c5b4797b0a..c4ee4547ed7 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/shapefile/ShapefileHandlerTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/shapefile/ShapefileHandlerTest.java @@ -63,22 +63,22 @@ private File createBlankFile(String filename) throws IOException { } return Files.createFile(tempFolder.resolve(filename)).toFile(); } - + private FileInputStream createZipReturnFilestream(List<String> file_names, String zipfile_name) throws IOException{ - + File zip_file_obj = this.createAndZipFiles(file_names, zipfile_name); if (zip_file_obj == null){ return null; } - + FileInputStream file_input_stream = new FileInputStream(zip_file_obj); return file_input_stream; - + } - + /* - Convenience class to create .zip file and return a FileInputStream + Convenience method to create .zip file and return a File @param List<String> file_names - List of filenames to add to .zip. 
These names will be used to create 0 length files @param String zipfile_name - Name of .zip file to create @@ -98,13 +98,13 @@ private File createAndZipFiles(List<String> file_names, String zipfile_name) thr } Path zip_file_obj = this.tempFolder.resolve(zipfile_name); - ZipOutputStream zip_stream = new ZipOutputStream(new FileOutputStream(zip_file_obj.toFile())); + try (ZipOutputStream zip_stream = new ZipOutputStream(new FileOutputStream(zip_file_obj.toFile()))) { - // Iterate through File objects and add them to the ZipOutputStream - for (File file_obj : fileCollection) { - this.addToZipFile(file_obj.getName(), file_obj, zip_stream); + // Iterate through File objects and add them to the ZipOutputStream + for (File file_obj : fileCollection) { + this.addToZipFile(file_obj.getName(), file_obj, zip_stream); + } } - /* ----------------------------------- Cleanup: Delete single files that were added to .zip ----------------------------------- */ @@ -126,7 +126,7 @@ public void testCreateZippedNonShapefile() throws IOException{ File zipfile_obj = createAndZipFiles(file_names, "not-quite-a-shape.zip"); // Pass the .zip to the ShapefileHandler - ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(zipfile_obj)); + ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); shp_handler.DEBUG= true; // Contains shapefile? @@ -157,7 +157,7 @@ public void testShapefileWithQpjAndQmd() throws IOException { File zipFile = createAndZipFiles(fileNames, "testShapeWithNewExtensions.zip"); // Pass the zip to the ShapefileHandler - ShapefileHandler shpHandler = new ShapefileHandler(new FileInputStream(zipFile)); + ShapefileHandler shpHandler = new ShapefileHandler(zipFile); shpHandler.DEBUG = true; // Check if it is recognized as a shapefile @@ -191,7 +191,7 @@ public void testZippedTwoShapefiles() throws IOException{ File zipfile_obj = createAndZipFiles(file_names, "two-shapes.zip"); // Pass the .zip to the ShapefileHandler - ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(zipfile_obj)); + ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); shp_handler.DEBUG= true; assertTrue(shp_handler.containsShapefile(), "verify shapefile existance"); @@ -217,7 +217,7 @@ public void testZippedTwoShapefiles() throws IOException{ // Rezip/Reorder the files File test_unzip_folder = Files.createDirectory(this.tempFolder.resolve("test_unzip")).toFile(); //File test_unzip_folder = new File("/Users/rmp553/Desktop/blah"); - shp_handler.rezipShapefileSets(new FileInputStream(zipfile_obj), test_unzip_folder ); + shp_handler.rezipShapefileSets(test_unzip_folder); // Does the re-ordering do what we wanted? 
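The ShapefileHandler changes being tested above switch the handler from one-shot ZipInputStream parsing to random-access reading via java.util.zip.ZipFile (see the examineZipfile diff earlier in this patch): the constructor now takes a File, and entries are enumerated from the archive's central directory. A minimal standalone sketch of that enumeration pattern, using only the JDK (the archive path is a placeholder):

    import java.io.File;
    import java.io.IOException;
    import java.util.Collections;
    import java.util.zip.ZipEntry;
    import java.util.zip.ZipFile;

    public class ZipEntryListing {
        public static void main(String[] args) throws IOException {
            // ZipFile reads the central directory, so entries can be listed
            // (and individual entries re-opened later) without consuming a
            // one-shot stream; try-with-resources closes the archive.
            try (ZipFile zipFile = new ZipFile(new File("example.zip"))) {
                for (ZipEntry entry : Collections.list(zipFile.entries())) {
                    if (!entry.isDirectory()) {
                        System.out.println(entry.getName() + " (" + entry.getSize() + " bytes)");
                    }
                }
            }
        }
    }

Because the handler holds the ZipFile and can re-read any entry on demand, rezipShapefileSets no longer needs a fresh FileInputStream from the caller, which is why its signature shrinks to just the target folder in the surrounding hunks.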
@@ -244,7 +244,7 @@ public void testZippedShapefileWithExtraFiles() throws IOException{ File zipfile_obj = createAndZipFiles(file_names, "shape-plus.zip"); // Pass the .zip to the ShapefileHandler - ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(zipfile_obj)); + ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); shp_handler.DEBUG= true; assertTrue(shp_handler.containsShapefile(), "verify shapefile existance"); @@ -264,7 +264,7 @@ public void testZippedShapefileWithExtraFiles() throws IOException{ File unzip2Folder = Files.createDirectory(this.tempFolder.resolve("test_unzip2")).toFile(); // Rezip/Reorder the files - shp_handler.rezipShapefileSets(new FileInputStream(zipfile_obj), unzip2Folder); + shp_handler.rezipShapefileSets(unzip2Folder); //shp_handler.rezipShapefileSets(new FileInputStream(zipfile_obj), new File("/Users/rmp553/Desktop/blah")); @@ -284,9 +284,9 @@ public void testZippedShapefileWithExtraFiles() throws IOException{ } @Test - public void testHiddenFiles() { + public void testHiddenFiles() throws IOException { // test with shapefiles in hidden directory - ShapefileHandler shp_handler = new ShapefileHandler("src/test/resources/hiddenShapefiles.zip"); + ShapefileHandler shp_handler = new ShapefileHandler(new File("src/test/resources/hiddenShapefiles.zip")); shp_handler.DEBUG= true; assertFalse(shp_handler.containsShapefile()); } diff --git a/src/test/resources/own-cloud-downloads/greetings.zip b/src/test/resources/own-cloud-downloads/greetings.zip new file mode 100644 index 00000000000..6e166d385d1 Binary files /dev/null and b/src/test/resources/own-cloud-downloads/greetings.zip differ diff --git a/src/test/resources/own-cloud-downloads/shapes.zip b/src/test/resources/own-cloud-downloads/shapes.zip new file mode 100644 index 00000000000..99d5f36c895 Binary files /dev/null and b/src/test/resources/own-cloud-downloads/shapes.zip differ
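For reference, the attribute updates exercised by testAttributesApi above reduce to single HTTP calls against the native API. A minimal standalone sketch using java.net.http; the server URL, alias, and token are placeholders, and the PUT verb with the X-Dataverse-key header mirrors what UtilIT.setCollectionAttribute is assumed to send:

    import java.net.URI;
    import java.net.URLEncoder;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;
    import java.nio.charset.StandardCharsets;

    public class SetCollectionAttributeExample {
        public static void main(String[] args) throws Exception {
            String serverUrl = "http://localhost:8080"; // placeholder installation URL
            String alias = "myCollection"; // placeholder collection alias
            String apiToken = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; // placeholder token

            // Update the collection's display name, as the test does for "name";
            // "description", "affiliation", and "alias" are updated the same way.
            String attribute = "name";
            String value = URLEncoder.encode("Renamed Name", StandardCharsets.UTF_8);

            HttpRequest request = HttpRequest.newBuilder()
                    .uri(URI.create(serverUrl + "/api/dataverses/" + alias
                            + "/attribute/" + attribute + "?value=" + value))
                    .header("X-Dataverse-key", apiToken) // native API auth header
                    .PUT(HttpRequest.BodyPublishers.noBody())
                    .build();

            HttpResponse<String> response = HttpClient.newHttpClient()
                    .send(request, HttpResponse.BodyHandlers.ofString());

            // Expect 200 with an "Update successful" message, or 401 when the
            // caller lacks permission (e.g. a regular user setting
            // filePIDsEnabled or renaming the root collection, per the test).
            System.out.println(response.statusCode() + " " + response.body());
        }
    }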