From d645939baf07d10d0f9f0b5e7b2722556400ff97 Mon Sep 17 00:00:00 2001 From: mohamed eskander Date: Wed, 17 Jan 2024 10:39:18 +0200 Subject: [PATCH 1/2] [DURACOM-143] Fix indexing errors & further improvements --- .../indexobject/IndexFactoryImpl.java | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java b/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java index 55c99b168e7a..4b012a072f33 100644 --- a/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java +++ b/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java @@ -2,7 +2,7 @@ * The contents of this file are subject to the license and copyright * detailed in the LICENSE and NOTICE files at the root of the source * tree and available online at - * + *

* http://www.dspace.org/license/ */ package org.dspace.discovery.indexobject; @@ -64,7 +64,14 @@ public SolrInputDocument buildDocument(Context context, T indexableObject) throw //Do any additional indexing, depends on the plugins for (SolrServiceIndexPlugin solrServiceIndexPlugin : ListUtils.emptyIfNull(solrServiceIndexPlugins)) { - solrServiceIndexPlugin.additionalIndex(context, indexableObject, doc); + try { + solrServiceIndexPlugin.additionalIndex(context, indexableObject, doc); + } catch (Exception e) { + log.error("An error occurred while indexing additional fields. " + + "Could not fully index item with UUID: {}. Plugin: {}", + indexableObject.getUniqueIndexID(), solrServiceIndexPlugin.getClass().getSimpleName()); + + } } return doc; @@ -82,7 +89,7 @@ public void writeDocument(Context context, T indexableObject, SolrInputDocument writeDocument(solrInputDocument, null); } catch (Exception e) { log.error("Error occurred while writing SOLR document for {} object {}", - indexableObject.getType(), indexableObject.getID(), e); + indexableObject.getType(), indexableObject.getID(), e); } } @@ -101,8 +108,8 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea if (streams != null && !streams.isEmpty()) { // limit full text indexing to first 100,000 characters unless configured otherwise final int charLimit = DSpaceServicesFactory.getInstance().getConfigurationService() - .getIntProperty("discovery.solr.fulltext.charLimit", - 100000); + .getIntProperty("discovery.solr.fulltext.charLimit", + 100000); // Use Tika's Text parser as the streams are always from the TEXT bundle (i.e. already extracted text) TextAndCSVParser tikaParser = new TextAndCSVParser(); @@ -113,6 +120,18 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea // Use Apache Tika to parse the full text stream(s) try (InputStream fullTextStreams = streams.getStream()) { tikaParser.parse(fullTextStreams, tikaHandler, tikaMetadata, tikaContext); + + // Write Tika metadata to "tika_meta_*" fields. + // This metadata is not very useful right now, + // but we'll keep it just in case it becomes more useful. + for (String name : tikaMetadata.names()) { + for (String value : tikaMetadata.getValues(name)) { + doc.addField("tika_meta_" + name, value); + } + } + + // Save (parsed) full text to "fulltext" field + doc.addField("fulltext", tikaHandler.toString()); } catch (SAXException saxe) { // Check if this SAXException is just a notice that this file was longer than the character limit. // Unfortunately there is not a unique, public exception type to catch here. This error is thrown @@ -121,30 +140,23 @@ protected void writeDocument(SolrInputDocument doc, FullTextContentStreams strea if (saxe.getMessage().contains("limit has been reached")) { // log that we only indexed up to that configured limit log.info("Full text is larger than the configured limit (discovery.solr.fulltext.charLimit)." - + " Only the first {} characters were indexed.", charLimit); + + " Only the first {} characters were indexed.", charLimit); } else { log.error("Tika parsing error. Could not index full text.", saxe); throw new IOException("Tika parsing error. Could not index full text.", saxe); } - } catch (TikaException ex) { + } catch (TikaException | IOException ex) { log.error("Tika parsing error. Could not index full text.", ex); throw new IOException("Tika parsing error. Could not index full text.", ex); + } finally { + // Add document to index + solr.add(doc); } - - // Write Tika metadata to "tika_meta_*" fields. - // This metadata is not very useful right now, but we'll keep it just in case it becomes more useful. - for (String name : tikaMetadata.names()) { - for (String value : tikaMetadata.getValues(name)) { - doc.addField("tika_meta_" + name, value); - } - } - - // Save (parsed) full text to "fulltext" field - doc.addField("fulltext", tikaHandler.toString()); + return; } - // Add document to index solr.add(doc); + } } From 324d2e3184dd35d2980e599f173adf18a7e31d64 Mon Sep 17 00:00:00 2001 From: mohamed eskander Date: Wed, 17 Jan 2024 10:56:08 +0200 Subject: [PATCH 2/2] [DURACOM-143] Fix license --- .../java/org/dspace/discovery/indexobject/IndexFactoryImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java b/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java index 4b012a072f33..f1ae137b9163 100644 --- a/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java +++ b/dspace-api/src/main/java/org/dspace/discovery/indexobject/IndexFactoryImpl.java @@ -2,7 +2,7 @@ * The contents of this file are subject to the license and copyright * detailed in the LICENSE and NOTICE files at the root of the source * tree and available online at - *

+ * * http://www.dspace.org/license/ */ package org.dspace.discovery.indexobject;