Skip to content

Commit

Permalink
Merge pull request DSpace#10152 from DSpace/backport-9893-to-main
Browse files (browse the repository at this point in the history)
[Port main] Fix full-text indexing for files over the character limit
  • Loading branch information
tdonohue authored Dec 19, 2024
2 parents 97dd1e0 + fc4cf8f commit 5610412
Showing 1 changed file with 15 additions and 16 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -118,20 +118,10 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea
ParseContext tikaContext = new ParseContext();

// Use Apache Tika to parse the full text stream(s)
boolean extractionSucceeded = false;
try (InputStream fullTextStreams = streams.getStream()) {
tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);

// Write Tika metadata to "tika_meta_*" fields.
// This metadata is not very useful right now,
// but we'll keep it just in case it becomes more useful.
for (String name : tikaMetadata.names()) {
for (String value : tikaMetadata.getValues(name)) {
doc.addField("tika_meta_" + name, value);
}
}

// Save (parsed) full text to "fulltext" field
doc.addField("fulltext", tikaHandler.toString());
extractionSucceeded = true;
} catch (SAXException saxe) {
// Check if this SAXException is just a notice that this file was longer than the character limit.
// Unfortunately there is not a unique, public exception type to catch here. This error is thrown
Expand All @@ -141,18 +131,27 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea
// log that we only indexed up to that configured limit
log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
+ " Only the first {} characters were indexed.", charLimit);
extractionSucceeded = true;
} else {
log.error("Tika parsing error. Could not index full text.", saxe);
throw new IOException("Tika parsing error. Could not index full text.", saxe);
}
} catch (TikaException | IOException ex) {
log.error("Tika parsing error. Could not index full text.", ex);
throw new IOException("Tika parsing error. Could not index full text.", ex);
} finally {
// Add document to index
solr.add(doc);
}
return;
if (extractionSucceeded) {
// Write Tika metadata to "tika_meta_*" fields.
// This metadata is not very useful right now,
// but we'll keep it just in case it becomes more useful.
for (String name : tikaMetadata.names()) {
for (String value : tikaMetadata.getValues(name)) {
doc.addField("tika_meta_" + name, value);
}
}
// Save (parsed) full text to "fulltext" field
doc.addField("fulltext", tikaHandler.toString());
}
}
// Add document to index
solr.add(doc);
Expand Down

0 comments on commit 5610412

Please sign in to comment.