Skip to content

Commit

Permalink
Merge pull request DSpace#9263 from 4Science/DURACOM-143
Browse files: browse the repository at this point in the history
Repository indexing fails when an item's Tika full-text processing fails with an error
  • Loading branch information
tdonohue authored Feb 15, 2024
2 parents f23f00f + 324d2e3 commit fbd3d60
Showing 1 changed file with 30 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,14 @@ public SolrInputDocument buildDocument(Context context, T indexableObject) throw

//Do any additional indexing, depends on the plugins
for (SolrServiceIndexPlugin solrServiceIndexPlugin : ListUtils.emptyIfNull(solrServiceIndexPlugins)) {
solrServiceIndexPlugin.additionalIndex(context, indexableObject, doc);
try {
solrServiceIndexPlugin.additionalIndex(context, indexableObject, doc);
} catch (Exception e) {
log.error("An error occurred while indexing additional fields. " +
"Could not fully index item with UUID: {}. Plugin: {}",
indexableObject.getUniqueIndexID(), solrServiceIndexPlugin.getClass().getSimpleName());

}
}

return doc;
Expand All @@ -82,7 +89,7 @@ public void writeDocument(Context context, T indexableObject, SolrInputDocument
writeDocument(solrInputDocument, null);
} catch (Exception e) {
log.error("Error occurred while writing SOLR document for {} object {}",
indexableObject.getType(), indexableObject.getID(), e);
indexableObject.getType(), indexableObject.getID(), e);
}
}

Expand All @@ -101,8 +108,8 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea
if (streams != null && !streams.isEmpty()) {
// limit full text indexing to first 100,000 characters unless configured otherwise
final int charLimit = DSpaceServicesFactory.getInstance().getConfigurationService()
.getIntProperty("discovery.solr.fulltext.charLimit",
100000);
.getIntProperty("discovery.solr.fulltext.charLimit",
100000);

// Use Tika's Text parser as the streams are always from the TEXT bundle (i.e. already extracted text)
TextAndCSVParser tikaParser = new TextAndCSVParser();
Expand All @@ -113,6 +120,18 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea
// Use Apache Tika to parse the full text stream(s)
try (InputStream fullTextStreams = streams.getStream()) {
tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext);

// Write Tika metadata to "tika_meta_*" fields.
// This metadata is not very useful right now,
// but we'll keep it just in case it becomes more useful.
for (String name : tikaMetadata.names()) {
for (String value : tikaMetadata.getValues(name)) {
doc.addField("tika_meta_" + name, value);
}
}

// Save (parsed) full text to "fulltext" field
doc.addField("fulltext", tikaHandler.toString());
} catch (SAXException saxe) {
// Check if this SAXException is just a notice that this file was longer than the character limit.
// Unfortunately there is not a unique, public exception type to catch here. This error is thrown
Expand All @@ -121,30 +140,23 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea
if (saxe.getMessage().contains("limit has been reached")) {
// log that we only indexed up to that configured limit
log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)."
+ " Only the first {} characters were indexed.", charLimit);
+ " Only the first {} characters were indexed.", charLimit);
} else {
log.error("Tika parsing error. Could not index full text.", saxe);
throw new IOException("Tika parsing error. Could not index full text.", saxe);
}
} catch (TikaException ex) {
} catch (TikaException | IOException ex) {
log.error("Tika parsing error. Could not index full text.", ex);
throw new IOException("Tika parsing error. Could not index full text.", ex);
} finally {
// Add document to index
solr.add(doc);
}

// Write Tika metadata to "tika_meta_*" fields.
// This metadata is not very useful right now, but we'll keep it just in case it becomes more useful.
for (String name : tikaMetadata.names()) {
for (String value : tikaMetadata.getValues(name)) {
doc.addField("tika_meta_" + name, value);
}
}

// Save (parsed) full text to "fulltext" field
doc.addField("fulltext", tikaHandler.toString());
return;
}

// Add document to index
solr.add(doc);

}
}

Expand Down

0 comments on commit fbd3d60

Please sign in to comment.