diff --git a/AUTHORS b/AUTHORS index 540c128..28bd094 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,2 +1,2 @@ -Original author: Mark Triggs . Please feel free +Original author: Mark Triggs . Please feel free to get in touch if you have any queries. diff --git a/README.md b/README.md index a0fa157..67d6684 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ should give you the two required jar files: browse-handler.jar browse-indexing.jar - + 2. Creating your browse indexes -------------------------------- @@ -49,8 +49,8 @@ file with lines of the form: Running it: - java -cp browse-indexing.jar PrintBrowseHeadings /path/to/your/bib/data/index subject-browse authority.index subjects.tmp - java -cp browse-indexing.jar PrintBrowseHeadings /path/to/your/bib/data/index author-browse authority.index names.tmp + java -cp browse-indexing.jar org.vufind.solr.indexing.PrintBrowseHeadings /path/to/your/bib/data/index subject-browse authority.index subjects.tmp + java -cp browse-indexing.jar org.vufind.solr.indexing.PrintBrowseHeadings /path/to/your/bib/data/index author-browse authority.index names.tmp By default this assumes you're using my default field names in your authority index, which are: @@ -78,8 +78,8 @@ The last step is to load all the headings into an SQLite database (which acts as the browse index, effectively). CreateBrowseSQLite does this: - java -cp browse-indexing.jar CreateBrowseSQLite sorted-names.tmp namesbrowse.db - java -cp browse-indexing.jar CreateBrowseSQLite sorted-subjects.tmp subjectsbrowse.db + java -cp browse-indexing.jar org.vufind.solr.indexing.CreateBrowseSQLite sorted-names.tmp namesbrowse.db + java -cp browse-indexing.jar org.vufind.solr.indexing.CreateBrowseSQLite sorted-subjects.tmp subjectsbrowse.db And that's the indexing process. At the end of this you should have @@ -160,3 +160,17 @@ Coding style is One True Brace style. In astyle: astyle --mode=java --style=1tbs -U -H -I -R 'browse-handler/*' 'browse-indexing/*' 'common/*' 'tests/org/*' + +6. Migration from earlier releases +----------------------------------- + +Versions of the browse handler included in VuFind® 9.x and earlier +used different names for certain components. The table below +summarizes the changes: + + | Former name | Current name | + | ------------------------------- | -------------------------------------------- | + | BIBLEECH (environment variable) | BIB_FIELD_ITERATOR | + | bibleech (Java System property) | bib_field_iterator | + | PrintBrowseHeadings (Class) | org.vufind.solr.indexing.PrintBrowseHeadings | + | CreateBrowseSQLite (Class) | org.vufind.solr.indexing.CreateBrowseSQLite | diff --git a/browse-indexing/Predicate.java b/browse-indexing/Predicate.java deleted file mode 100644 index 65e55f0..0000000 --- a/browse-indexing/Predicate.java +++ /dev/null @@ -1,9 +0,0 @@ -// -// Author: Mark Triggs -// - - -public interface Predicate -{ - boolean isSatisfiedBy(Object obj); -} diff --git a/browse-indexing/PrintBrowseHeadings.java b/browse-indexing/PrintBrowseHeadings.java deleted file mode 100644 index 23f8eee..0000000 --- a/browse-indexing/PrintBrowseHeadings.java +++ /dev/null @@ -1,230 +0,0 @@ -// -// Author: Mark Triggs -// - -import java.io.*; -import java.nio.charset.*; - -import org.apache.lucene.store.*; -import org.apache.lucene.search.*; -import org.apache.lucene.index.*; -import org.apache.lucene.document.*; - -import org.vufind.util.BrowseEntry; - -// Note that this version is coming from Solr! 
-import org.apache.commons.codec.binary.Base64; - - -public class PrintBrowseHeadings -{ - private Leech bibLeech; - private Leech nonprefAuthLeech; - - IndexSearcher bibSearcher; - IndexSearcher authSearcher; - - private String luceneField; - - private String KEY_SEPARATOR = "\1"; - private String RECORD_SEPARATOR = "\r\n"; - - /** - * Load headings from the index into a file. - * - * @param leech Leech for pulling in headings - * @param out Output target - * @param predicate Optional Predicate for filtering headings - */ - private void loadHeadings(Leech leech, - PrintWriter out, - Predicate predicate) - throws Exception - { - BrowseEntry h; - while ((h = leech.next()) != null) { - // We use a byte array for the sort key instead of a string to ensure - // consistent sorting even if the index tool and browse handler are running - // with different locale settings. Using strings results in less predictable - // behavior. - byte[] sort_key = h.key; - String key_text = h.key_text; - String heading = h.value; - - if (predicate != null && - !predicate.isSatisfiedBy(heading)) { - continue; - } - - if (sort_key != null) { - // Output a delimited key/value pair, base64-encoding both strings - // to ensure that no characters overlap with the delimiter or introduce - // \n's that could interfere with line-based sorting of the file. - out.print(new String(Base64.encodeBase64(sort_key)) + - KEY_SEPARATOR + - new String(Base64.encodeBase64(key_text.getBytes(Charset.forName("UTF-8")))) + - KEY_SEPARATOR + - new String(Base64.encodeBase64(heading.getBytes(Charset.forName("UTF-8")))) + - RECORD_SEPARATOR); - } - } - } - - - private int bibCount(String heading) throws IOException - { - TotalHitCountCollector counter = new TotalHitCountCollector(); - - bibSearcher.search(new ConstantScoreQuery(new TermQuery(new Term(luceneField, heading))), - counter); - - return counter.getTotalHits(); - } - - - private boolean isLinkedFromBibData(String heading) - throws IOException - { - TopDocs hits = null; - - int max_headings = 20; - while (true) { - hits = authSearcher.search - (new ConstantScoreQuery - (new TermQuery - (new Term - (System.getProperty("field.insteadof", "insteadOf"), - heading))), - max_headings); - - if (hits.scoreDocs.length < max_headings) { - // That's all of them. All done. - break; - } else { - // Hm. That's a lot of headings. Go back for more. - max_headings *= 2; - } - } - - StoredFields storedFields = authSearcher.getIndexReader().storedFields(); - for (int i = 0; i < hits.scoreDocs.length; i++) { - Document doc = storedFields.document(hits.scoreDocs[i].doc); - - String[] preferred = doc.getValues(System.getProperty("field.preferred", "preferred")); - if (preferred.length > 0) { - String preferredHeading = preferred[0]; - - if (bibCount(preferredHeading) > 0) { - return true; - } - } else { - return false; - } - } - - return false; - } - - - private String getEnvironment(String var) - { - return (System.getenv(var) != null) ? 
- System.getenv(var) : System.getProperty(var.toLowerCase()); - } - - - private Leech getBibLeech(String bibPath, String luceneField) - throws Exception - { - String leechClass = "Leech"; - - if (getEnvironment("BIBLEECH") != null) { - leechClass = getEnvironment("BIBLEECH"); - } - - return (Leech)(Class.forName(leechClass) - .getConstructor(String.class, String.class) - .newInstance(bibPath, luceneField)); - } - - - public void create(String bibPath, - String luceneField, - String authPath, - String outFile) - throws Exception - { - bibLeech = getBibLeech(bibPath, luceneField); - this.luceneField = luceneField; - - IndexReader bibReader = DirectoryReader.open(FSDirectory.open(new File(bibPath).toPath())); - bibSearcher = new IndexSearcher(bibReader); - - PrintWriter out = new PrintWriter(new FileWriter(outFile)); - - if (authPath != null) { - try { - nonprefAuthLeech = new Leech(authPath, - System.getProperty("field.insteadof", - "insteadOf")); - } catch (IndexNotFoundException e) { - // If no data has been written to the index yet, this exception - // might get thrown; in that case, we should skip loading authority - // data rather than breaking the whole indexing process. - nonprefAuthLeech = null; - } - - if (nonprefAuthLeech != null) { - IndexReader authReader = DirectoryReader.open(FSDirectory.open(new File(authPath).toPath())); - authSearcher = new IndexSearcher(authReader); - - loadHeadings(nonprefAuthLeech, out, - new Predicate() { - public boolean isSatisfiedBy(Object obj) { - String heading = (String) obj; - - try { - return isLinkedFromBibData(heading); - } catch (IOException e) { - return true; - } - } - } - ); - - nonprefAuthLeech.dropOff(); - } - } - - loadHeadings(bibLeech, out, null); - - bibLeech.dropOff(); - - out.close(); - } - - - public static void main(String args[]) - throws Exception - { - if (args.length != 3 && args.length != 4) { - System.err.println - ("Usage: PrintBrowseHeadings " - + " "); - System.err.println("\nor:\n"); - System.err.println - ("Usage: PrintBrowseHeadings " - + " "); - - System.exit(0); - } - - PrintBrowseHeadings self = new PrintBrowseHeadings(); - - if (args.length == 4) { - self.create(args[0], args[1], args[2], args[3]); - } else { - self.create(args[0], args[1], null, args[2]); - } - } -} diff --git a/build.xml b/build.xml index 6110a0b..f6da9e3 100644 --- a/build.xml +++ b/build.xml @@ -18,13 +18,10 @@ - + - - - @@ -64,7 +61,6 @@ - @@ -77,73 +73,38 @@ - + - - - - - + + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - + + - - + + + + - - - - @@ -198,18 +159,24 @@ + + + + + + + - + description="Run tests, assumes build is current and test cores are set up" + depends="build-tests"> - - - @@ -220,7 +187,7 @@ - + diff --git a/common/java/org/vufind/util/Utils.java b/common/java/org/vufind/util/Utils.java deleted file mode 100644 index c583525..0000000 --- a/common/java/org/vufind/util/Utils.java +++ /dev/null @@ -1,10 +0,0 @@ -package org.vufind.util; - -public class Utils -{ - public static String getEnvironment(String var) - { - return (System.getenv(var) != null) ? 
- System.getenv(var) : System.getProperty(var.toLowerCase()); - } -} diff --git a/libs/clojure-1.4.0.jar b/libs/clojure-1.4.0.jar deleted file mode 100644 index b8c1b5a..0000000 Binary files a/libs/clojure-1.4.0.jar and /dev/null differ diff --git a/src/main/java/compat/CreateBrowseSQLite.java b/src/main/java/compat/CreateBrowseSQLite.java new file mode 100644 index 0000000..ccf8d3e --- /dev/null +++ b/src/main/java/compat/CreateBrowseSQLite.java @@ -0,0 +1,13 @@ +import org.vufind.util.Utils; + +public class CreateBrowseSQLite +{ + public static void main(String args[]) throws Exception + { + Utils.printDeprecationWarning("You are using the 'CreateBrowseSQLite' class.", + "This still works, but it has been renamed to 'org.vufind.solr.indexing.CreateBrowseSQLite'", + "You should switch to avoid breakage in future versions."); + + org.vufind.solr.indexing.CreateBrowseSQLite.main(args); + } +} diff --git a/src/main/java/compat/PrintBrowseHeadings.java b/src/main/java/compat/PrintBrowseHeadings.java new file mode 100644 index 0000000..7c08cde --- /dev/null +++ b/src/main/java/compat/PrintBrowseHeadings.java @@ -0,0 +1,13 @@ +import org.vufind.util.Utils; + +public class PrintBrowseHeadings +{ + public static void main(String args[]) throws Exception + { + Utils.printDeprecationWarning("You are using the 'PrintBrowseHeadings' class.", + "This still works, but it has been renamed to 'org.vufind.solr.indexing.PrintBrowseHeadings'", + "You should switch to avoid breakage in future versions."); + + org.vufind.solr.indexing.PrintBrowseHeadings.main(args); + } +} diff --git a/browse-handler/java/org/vufind/solr/handler/AuthDB.java b/src/main/java/org/vufind/solr/handler/AuthDB.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/AuthDB.java rename to src/main/java/org/vufind/solr/handler/AuthDB.java diff --git a/browse-handler/java/org/vufind/solr/handler/BibDB.java b/src/main/java/org/vufind/solr/handler/BibDB.java similarity index 99% rename from browse-handler/java/org/vufind/solr/handler/BibDB.java rename to src/main/java/org/vufind/solr/handler/BibDB.java index 038f9da..96b3059 100644 --- a/browse-handler/java/org/vufind/solr/handler/BibDB.java +++ b/src/main/java/org/vufind/solr/handler/BibDB.java @@ -8,6 +8,7 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Map; + import org.apache.lucene.document.Document; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; diff --git a/browse-handler/java/org/vufind/solr/handler/Browse.java b/src/main/java/org/vufind/solr/handler/Browse.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/Browse.java rename to src/main/java/org/vufind/solr/handler/Browse.java diff --git a/browse-handler/java/org/vufind/solr/handler/BrowseItem.java b/src/main/java/org/vufind/solr/handler/BrowseItem.java similarity index 98% rename from browse-handler/java/org/vufind/solr/handler/BrowseItem.java rename to src/main/java/org/vufind/solr/handler/BrowseItem.java index 302269d..7f69e94 100644 --- a/browse-handler/java/org/vufind/solr/handler/BrowseItem.java +++ b/src/main/java/org/vufind/solr/handler/BrowseItem.java @@ -6,6 +6,8 @@ import java.util.List; import java.util.Map; +import org.vufind.solr.handler.client.solrj.BrowseResponse; + /** * Container class for data in a single browse entry. 
diff --git a/browse-handler/java/org/vufind/solr/handler/BrowseList.java b/src/main/java/org/vufind/solr/handler/BrowseList.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/BrowseList.java rename to src/main/java/org/vufind/solr/handler/BrowseList.java diff --git a/browse-handler/java/org/vufind/solr/handler/BrowseRequestHandler.java b/src/main/java/org/vufind/solr/handler/BrowseRequestHandler.java similarity index 99% rename from browse-handler/java/org/vufind/solr/handler/BrowseRequestHandler.java rename to src/main/java/org/vufind/solr/handler/BrowseRequestHandler.java index 3a4f6cd..fcccefa 100644 --- a/browse-handler/java/org/vufind/solr/handler/BrowseRequestHandler.java +++ b/src/main/java/org/vufind/solr/handler/BrowseRequestHandler.java @@ -1,5 +1,5 @@ // -// Author: Mark Triggs +// Author: Mark Triggs // diff --git a/browse-handler/java/org/vufind/solr/handler/BrowseSource.java b/src/main/java/org/vufind/solr/handler/BrowseSource.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/BrowseSource.java rename to src/main/java/org/vufind/solr/handler/BrowseSource.java diff --git a/browse-handler/java/org/vufind/solr/handler/HeadingSlice.java b/src/main/java/org/vufind/solr/handler/HeadingSlice.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/HeadingSlice.java rename to src/main/java/org/vufind/solr/handler/HeadingSlice.java diff --git a/browse-handler/java/org/vufind/solr/handler/HeadingsDB.java b/src/main/java/org/vufind/solr/handler/HeadingsDB.java similarity index 78% rename from browse-handler/java/org/vufind/solr/handler/HeadingsDB.java rename to src/main/java/org/vufind/solr/handler/HeadingsDB.java index 9061c54..c7bd265 100644 --- a/browse-handler/java/org/vufind/solr/handler/HeadingsDB.java +++ b/src/main/java/org/vufind/solr/handler/HeadingsDB.java @@ -60,16 +60,11 @@ private void openDB() throws Exception db.setAutoCommit(false); dbVersion = currentVersion(); - PreparedStatement countStmnt = db.prepareStatement( - "select count(1) as count from headings"); - - ResultSet rs = countStmnt.executeQuery(); - rs.next(); - - totalCount = rs.getInt("count"); - - rs.close(); - countStmnt.close(); + try (PreparedStatement countStmnt = db.prepareStatement("select count(1) as count from headings"); + ResultSet rs = countStmnt.executeQuery()) { + rs.next(); + totalCount = rs.getInt("count"); + } } @@ -134,42 +129,44 @@ public synchronized HeadingSlice getHeadings(int rowid, { HeadingSlice result = new HeadingSlice(); - PreparedStatement rowStmnt = db.prepareStatement( + try (PreparedStatement rowStmnt = db.prepareStatement( String.format("select * from headings " + "where rowid >= ? 
" + "order by rowid " + "limit %d ", rows) - ); - - rowStmnt.setInt(1, rowid); - - ResultSet rs = null; + )) { + rowStmnt.setInt(1, rowid); + + ResultSet rs = null; + + for (int attempt = 0; attempt < 3; attempt++) { + try { + rs = rowStmnt.executeQuery(); + break; + } catch (SQLException e) { + Log.info("Retry number " + attempt + "..."); + Thread.sleep(50); + } + } - for (int attempt = 0; attempt < 3; attempt++) { - try { - rs = rowStmnt.executeQuery(); - break; - } catch (SQLException e) { - Log.info("Retry number " + attempt + "..."); - Thread.sleep(50); + if (rs == null) { + return result; } - } - if (rs == null) { - return result; - } + try { + while (rs.next()) { + result.sort_keys.add(rs.getString("key_text")); + result.headings.add(rs.getString("heading")); + } - while (rs.next()) { - result.sort_keys.add(rs.getString("key_text")); - result.headings.add(rs.getString("heading")); + } finally { + rs.close(); + } } - rs.close(); - rowStmnt.close(); - result.total = Math.max(0, (totalCount - rowid) + 1); return result; } -} \ No newline at end of file +} diff --git a/browse-handler/java/org/vufind/solr/handler/Log.java b/src/main/java/org/vufind/solr/handler/Log.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/Log.java rename to src/main/java/org/vufind/solr/handler/Log.java diff --git a/browse-handler/java/org/vufind/solr/handler/MatchTypeResponse.java b/src/main/java/org/vufind/solr/handler/MatchTypeResponse.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/MatchTypeResponse.java rename to src/main/java/org/vufind/solr/handler/MatchTypeResponse.java diff --git a/browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseRequest.java b/src/main/java/org/vufind/solr/handler/client/solrj/BrowseRequest.java similarity index 100% rename from browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseRequest.java rename to src/main/java/org/vufind/solr/handler/client/solrj/BrowseRequest.java diff --git a/browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java b/src/main/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java similarity index 96% rename from browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java rename to src/main/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java index 4a84cfd..3f1bbf5 100644 --- a/browse-handler/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java +++ b/src/main/java/org/vufind/solr/handler/client/solrj/BrowseResponse.java @@ -3,10 +3,9 @@ import java.util.Map; import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.SolrResponseBase; -import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.util.NamedList; -import org.vufind.solr.handler.MatchTypeResponse.MatchType; /** * Holds the response from BrowseRequest. 
diff --git a/browse-indexing/CreateBrowseSQLite.java b/src/main/java/org/vufind/solr/indexing/CreateBrowseSQLite.java similarity index 51% rename from browse-indexing/CreateBrowseSQLite.java rename to src/main/java/org/vufind/solr/indexing/CreateBrowseSQLite.java index ff8f270..eb67f72 100644 --- a/browse-indexing/CreateBrowseSQLite.java +++ b/src/main/java/org/vufind/solr/indexing/CreateBrowseSQLite.java @@ -1,10 +1,15 @@ +package org.vufind.solr.indexing; + // -// Author: Mark Triggs +// Author: Mark Triggs // - -import java.io.*; - -import java.sql.*; +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.Statement; // Note that this version is coming from Solr! import org.apache.commons.codec.binary.Base64; @@ -60,34 +65,32 @@ private void loadHeadings(BufferedReader br) outputDB.setAutoCommit(false); - PreparedStatement prep = outputDB.prepareStatement( - "insert or ignore into all_headings (key, key_text, heading) values (?, ?, ?)"); + try (PreparedStatement prep = outputDB.prepareStatement("insert or ignore into all_headings (key, key_text, heading) values (?, ?, ?)")) { + String line; + while ((line = readCRLFLine(br)) != null) { + String[] fields = line.split(KEY_SEPARATOR); - String line; - while ((line = readCRLFLine(br)) != null) { - String[] fields = line.split(KEY_SEPARATOR); + if (fields.length == 3) { + // If we found the separator character, we have a key/value pair of + // Base64-encoded strings to decode and push into the batch: + prep.setBytes(1, Base64.decodeBase64(fields[0].getBytes())); + prep.setBytes(2, Base64.decodeBase64(fields[1].getBytes())); + prep.setBytes(3, Base64.decodeBase64(fields[2].getBytes())); - if (fields.length == 3) { - // If we found the separator character, we have a key/value pair of - // Base64-encoded strings to decode and push into the batch: - prep.setBytes(1, Base64.decodeBase64(fields[0].getBytes())); - prep.setBytes(2, Base64.decodeBase64(fields[1].getBytes())); - prep.setBytes(3, Base64.decodeBase64(fields[2].getBytes())); + prep.addBatch(); + } - prep.addBatch(); - } + if ((count % 500000) == 0) { + prep.executeBatch(); + prep.clearBatch(); + } - if ((count % 500000) == 0) { - prep.executeBatch(); - prep.clearBatch(); + count++; } - count++; + prep.executeBatch(); } - prep.executeBatch(); - prep.close(); - outputDB.commit(); outputDB.setAutoCommit(true); } @@ -96,29 +99,26 @@ private void loadHeadings(BufferedReader br) private void setupDatabase() throws Exception { - Statement stat = outputDB.createStatement(); - - stat.executeUpdate("drop table if exists all_headings;"); - stat.executeUpdate("create table all_headings (key, key_text, heading);"); - stat.executeUpdate("PRAGMA synchronous = OFF;"); - stat.execute("PRAGMA journal_mode = OFF;"); - - stat.close(); + try (Statement stat = outputDB.createStatement()) { + stat.executeUpdate("drop table if exists all_headings;"); + stat.executeUpdate("create table all_headings (key, key_text, heading);"); + stat.executeUpdate("PRAGMA synchronous = OFF;"); + stat.execute("PRAGMA journal_mode = OFF;"); + } } private void buildOrderedTables() throws Exception { - Statement stat = outputDB.createStatement(); + try (Statement stat = outputDB.createStatement()) { - stat.executeUpdate("drop table if exists headings;"); - stat.executeUpdate("create table headings " + - "as select * from all_headings order by key;"); + stat.executeUpdate("drop table if 
exists headings;"); + stat.executeUpdate("create table headings " + + "as select * from all_headings order by key;"); - stat.executeUpdate("create index keyindex on headings (key);"); - - stat.close(); + stat.executeUpdate("create index keyindex on headings (key);"); + } } @@ -130,12 +130,9 @@ public void create(String headingsFile, String outputPath) setupDatabase(); - BufferedReader br = new BufferedReader - (new FileReader(headingsFile)); - - loadHeadings(br); - - br.close(); + try (BufferedReader br = new BufferedReader(new FileReader(headingsFile))) { + loadHeadings(br); + } buildOrderedTables(); } diff --git a/src/main/java/org/vufind/solr/indexing/Predicate.java b/src/main/java/org/vufind/solr/indexing/Predicate.java new file mode 100644 index 0000000..602bb5e --- /dev/null +++ b/src/main/java/org/vufind/solr/indexing/Predicate.java @@ -0,0 +1,11 @@ +package org.vufind.solr.indexing; + +// +// Author: Mark Triggs +// + + +public interface Predicate +{ + boolean isSatisfiedBy(Object obj); +} diff --git a/src/main/java/org/vufind/solr/indexing/PrintBrowseHeadings.java b/src/main/java/org/vufind/solr/indexing/PrintBrowseHeadings.java new file mode 100644 index 0000000..da59bc7 --- /dev/null +++ b/src/main/java/org/vufind/solr/indexing/PrintBrowseHeadings.java @@ -0,0 +1,254 @@ +package org.vufind.solr.indexing; + +// +// Author: Mark Triggs +// +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.charset.Charset; + +// Note that this version is coming from Solr! +import org.apache.commons.codec.binary.Base64; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexNotFoundException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.StoredFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHitCountCollector; +import org.apache.lucene.store.FSDirectory; +import org.vufind.util.BrowseEntry; +import org.vufind.util.Utils; + + +public class PrintBrowseHeadings +{ + private SolrFieldIterator nonprefAuthFieldIterator; + + IndexSearcher bibSearcher; + IndexSearcher authSearcher; + + private String luceneField; + + private String KEY_SEPARATOR = "\1"; + private String RECORD_SEPARATOR = "\r\n"; + + /** + * Load headings from the index into a file. + * + * @param fieldIterator SolrFieldIterator source for headings + * @param out Output target + * @param predicate Optional Predicate for filtering headings + */ + private void loadHeadings(SolrFieldIterator fieldIterator, + PrintWriter out, + Predicate predicate) + throws Exception + { + for (BrowseEntry h : fieldIterator) { + // We use a byte array for the sort key instead of a string to ensure + // consistent sorting even if the index tool and browse handler are running + // with different locale settings. Using strings results in less predictable + // behavior. 
+ byte[] sort_key = h.key; + String key_text = h.key_text; + String heading = h.value; + + if (predicate != null && + !predicate.isSatisfiedBy(heading)) { + continue; + } + + if (sort_key != null) { + // Output a delimited key/value pair, base64-encoding both strings + // to ensure that no characters overlap with the delimiter or introduce + // \n's that could interfere with line-based sorting of the file. + out.print(new String(Base64.encodeBase64(sort_key)) + + KEY_SEPARATOR + + new String(Base64.encodeBase64(key_text.getBytes(Charset.forName("UTF-8")))) + + KEY_SEPARATOR + + new String(Base64.encodeBase64(heading.getBytes(Charset.forName("UTF-8")))) + + RECORD_SEPARATOR); + } + } + } + + + private int bibCount(String heading) throws IOException + { + TotalHitCountCollector counter = new TotalHitCountCollector(); + + bibSearcher.search(new ConstantScoreQuery(new TermQuery(new Term(luceneField, heading))), + counter); + + return counter.getTotalHits(); + } + + + private boolean isLinkedFromBibData(String heading) + throws IOException + { + TopDocs hits = null; + + int max_headings = 20; + while (true) { + hits = authSearcher.search + (new ConstantScoreQuery + (new TermQuery + (new Term + (System.getProperty("field.insteadof", "insteadOf"), + heading))), + max_headings); + + if (hits.scoreDocs.length < max_headings) { + // That's all of them. All done. + break; + } else { + // Hm. That's a lot of headings. Go back for more. + max_headings *= 2; + } + } + + StoredFields storedFields = authSearcher.getIndexReader().storedFields(); + for (int i = 0; i < hits.scoreDocs.length; i++) { + Document doc = storedFields.document(hits.scoreDocs[i].doc); + + String[] preferred = doc.getValues(System.getProperty("field.preferred", "preferred")); + if (preferred.length > 0) { + String preferredHeading = preferred[0]; + + if (bibCount(preferredHeading) > 0) { + return true; + } + } else { + return false; + } + } + + return false; + } + + + private SolrFieldIterator getBibIterator(String bibPath, String luceneField) + throws Exception + { + String fieldIteratorClass = "org.vufind.solr.indexing.SolrFieldIterator"; + + if (Utils.getEnvironment("BIBLEECH") != null) { + if (System.getenv("BIBLEECH") != null) { + Utils.printDeprecationWarning("You are using the 'BIBLEECH' environment variable.", + "This still works, but it has been renamed to 'BIB_FIELD_ITERATOR'", + "You should switch to avoid breakage in future versions."); + } + + if (System.getProperty("bibleech") != null) { + Utils.printDeprecationWarning("You are using the 'bibleech' system property.", + "This still works, but it has been renamed to 'bib_field_iterator'", + "You should switch to avoid breakage in future versions."); + + } + + fieldIteratorClass = Utils.getEnvironment("BIBLEECH"); + } + + + if (Utils.getEnvironment("BIB_FIELD_ITERATOR") != null) { + fieldIteratorClass = Utils.getEnvironment("BIB_FIELD_ITERATOR"); + } + + if ("StoredFieldLeech".equals(fieldIteratorClass)) { + Utils.printDeprecationWarning("You are using the 'StoredFieldLeech' class.", + "This still works, but it has been renamed to 'org.vufind.solr.indexing.StoredFieldIterator'", + "You should switch to avoid breakage in future versions."); + fieldIteratorClass = "org.vufind.solr.indexing.StoredFieldIterator"; + } + + return (SolrFieldIterator)(Class.forName(fieldIteratorClass) + .getConstructor(String.class, String.class) + .newInstance(bibPath, luceneField)); + } + + + public void create(String bibPath, + String luceneField, + String authPath, + String outFile) + throws 
Exception + { + try (SolrFieldIterator bibFieldIterator = getBibIterator(bibPath, luceneField)) { + this.luceneField = luceneField; + + IndexReader bibReader = DirectoryReader.open(FSDirectory.open(new File(bibPath).toPath())); + bibSearcher = new IndexSearcher(bibReader); + + try (PrintWriter out = new PrintWriter(new FileWriter(outFile))) { + if (authPath != null) { + try { + nonprefAuthFieldIterator = new SolrFieldIterator(authPath, + System.getProperty("field.insteadof", + "insteadOf")); + } catch (IndexNotFoundException e) { + // If no data has been written to the index yet, this exception + // might get thrown; in that case, we should skip loading authority + // data rather than breaking the whole indexing process. + nonprefAuthFieldIterator = null; + } + + if (nonprefAuthFieldIterator != null) { + IndexReader authReader = DirectoryReader.open(FSDirectory.open(new File(authPath).toPath())); + authSearcher = new IndexSearcher(authReader); + + loadHeadings(nonprefAuthFieldIterator, out, + new Predicate() { + public boolean isSatisfiedBy(Object obj) { + String heading = (String) obj; + + try { + return isLinkedFromBibData(heading); + } catch (IOException e) { + return true; + } + } + } + ); + + nonprefAuthFieldIterator.close(); + } + } + + loadHeadings(bibFieldIterator, out, null); + } + } + } + + + public static void main(String args[]) + throws Exception + { + if (args.length != 3 && args.length != 4) { + System.err.println + ("Usage: PrintBrowseHeadings " + + " "); + System.err.println("\nor:\n"); + System.err.println + ("Usage: PrintBrowseHeadings " + + " "); + + System.exit(0); + } + + PrintBrowseHeadings self = new PrintBrowseHeadings(); + + if (args.length == 4) { + self.create(args[0], args[1], args[2], args[3]); + } else { + self.create(args[0], args[1], null, args[2]); + } + } +} diff --git a/browse-indexing/Leech.java b/src/main/java/org/vufind/solr/indexing/SolrFieldIterator.java similarity index 60% rename from browse-indexing/Leech.java rename to src/main/java/org/vufind/solr/indexing/SolrFieldIterator.java index bea0826..c0889c7 100644 --- a/browse-indexing/Leech.java +++ b/src/main/java/org/vufind/solr/indexing/SolrFieldIterator.java @@ -1,17 +1,30 @@ -import org.apache.lucene.store.*; -import org.apache.lucene.index.*; -import org.apache.lucene.search.*; -import java.io.*; -import java.util.*; - +package org.vufind.solr.indexing; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +import org.apache.lucene.index.CompositeReader; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; - import org.vufind.util.BrowseEntry; import org.vufind.util.Normalizer; import org.vufind.util.NormalizerFactory; -public class Leech +public class SolrFieldIterator implements AutoCloseable, Iterator, Iterable { protected CompositeReader reader; protected IndexSearcher searcher; @@ -23,9 +36,10 @@ public class Leech TermsEnum tenum = null; + private BrowseEntry nextEntry = null; + private boolean exhausted = false; - public 
Leech(String indexPath, - String field) throws Exception + public SolrFieldIterator(String indexPath, String field) throws Exception { // Open our composite reader (a top-level DirectoryReader that // contains one reader per segment in our index). @@ -52,7 +66,7 @@ public byte[] buildSortKey(String heading) } - public void dropOff() throws IOException + public void close() throws IOException { reader.close(); } @@ -73,7 +87,7 @@ private boolean termExists(String t) // // If there's no currently selected TermEnum, create one from the reader. // - public BrowseEntry next() throws Exception + protected BrowseEntry readNext() throws IOException { for (;;) { if (tenum == null) { @@ -113,4 +127,52 @@ public BrowseEntry next() throws Exception // Try the next term } } + + + public void tryReadNext() { + if (nextEntry != null) { + // Already have one + return; + } + + if (exhausted) { + // Nothing more to read + return; + } + + try { + nextEntry = readNext(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + if (nextEntry == null) { + exhausted = true; + } + } + + @Override + public BrowseEntry next() { + tryReadNext(); + + if (nextEntry == null) { + throw new NoSuchElementException(); + } + + BrowseEntry result = nextEntry; + nextEntry = null; + + return result; + } + + @Override + public boolean hasNext() { + tryReadNext(); + + return nextEntry != null; + } + + @Override + public Iterator iterator() { + return this; + } } diff --git a/browse-indexing/StoredFieldLeech.java b/src/main/java/org/vufind/solr/indexing/StoredFieldIterator.java similarity index 80% rename from browse-indexing/StoredFieldLeech.java rename to src/main/java/org/vufind/solr/indexing/StoredFieldIterator.java index 0bdf556..ca3ba0e 100644 --- a/browse-indexing/StoredFieldLeech.java +++ b/src/main/java/org/vufind/solr/indexing/StoredFieldIterator.java @@ -1,17 +1,24 @@ +package org.vufind.solr.indexing; + // Build a browse list by walking the docs in an index and extracting sort key // and values from a pair of stored fields. 
- -import java.io.*; -import java.util.*; -import org.apache.lucene.store.*; -import org.apache.lucene.index.*; -import org.apache.lucene.document.*; +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Set; + +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.MultiBits; +import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Bits; - -import org.vufind.util.Utils; import org.vufind.util.BrowseEntry; +import org.vufind.util.Utils; -public class StoredFieldLeech extends Leech +public class StoredFieldIterator extends SolrFieldIterator { int currentDoc = 0; LinkedList buffer; @@ -23,7 +30,7 @@ public class StoredFieldLeech extends Leech private Bits liveDocsBitSet; - public StoredFieldLeech(String indexPath, String field) throws Exception + public StoredFieldIterator(String indexPath, String field) throws Exception { super(indexPath, field); @@ -50,8 +57,7 @@ public StoredFieldLeech(String indexPath, String field) throws Exception } - private void loadDocument(IndexReader reader, int docid) - throws Exception + private void loadDocument(IndexReader reader, int docid) throws IOException { Document doc = reader.storedFields().document(currentDoc, fieldSelection); @@ -82,7 +88,7 @@ private void loadDocument(IndexReader reader, int docid) } - public BrowseEntry next() throws Exception + protected BrowseEntry readNext() throws IOException { while (buffer.isEmpty()) { if (currentDoc < reader.maxDoc()) { diff --git a/common/java/org/vufind/util/BrowseEntry.java b/src/main/java/org/vufind/util/BrowseEntry.java similarity index 100% rename from common/java/org/vufind/util/BrowseEntry.java rename to src/main/java/org/vufind/util/BrowseEntry.java diff --git a/common/java/org/vufind/util/DeweyCallNormalizer.java b/src/main/java/org/vufind/util/DeweyCallNormalizer.java similarity index 100% rename from common/java/org/vufind/util/DeweyCallNormalizer.java rename to src/main/java/org/vufind/util/DeweyCallNormalizer.java diff --git a/common/java/org/vufind/util/ICUCollatorNormalizer.java b/src/main/java/org/vufind/util/ICUCollatorNormalizer.java similarity index 95% rename from common/java/org/vufind/util/ICUCollatorNormalizer.java rename to src/main/java/org/vufind/util/ICUCollatorNormalizer.java index 3aa933a..e074764 100644 --- a/common/java/org/vufind/util/ICUCollatorNormalizer.java +++ b/src/main/java/org/vufind/util/ICUCollatorNormalizer.java @@ -1,16 +1,15 @@ package org.vufind.util; -import java.util.regex.*; - import com.ibm.icu.text.CollationKey; import com.ibm.icu.text.Collator; +import java.util.regex.Pattern; /** * Normalizer class which uses the ICU Collator class to produce collation byte arrays. * The use of Collator takes into account diacritics and other Unicode features. * This normalizer should be suitable for most text fields. 
* - * @author Mark Triggs + * @author Mark Triggs * @author Tod Olson * */ diff --git a/common/java/org/vufind/util/LCCallNormalizer.java b/src/main/java/org/vufind/util/LCCallNormalizer.java similarity index 100% rename from common/java/org/vufind/util/LCCallNormalizer.java rename to src/main/java/org/vufind/util/LCCallNormalizer.java diff --git a/common/java/org/vufind/util/NACONormalizer.java b/src/main/java/org/vufind/util/NACONormalizer.java similarity index 100% rename from common/java/org/vufind/util/NACONormalizer.java rename to src/main/java/org/vufind/util/NACONormalizer.java diff --git a/common/java/org/vufind/util/Normalizer.java b/src/main/java/org/vufind/util/Normalizer.java similarity index 100% rename from common/java/org/vufind/util/Normalizer.java rename to src/main/java/org/vufind/util/Normalizer.java diff --git a/common/java/org/vufind/util/NormalizerFactory.java b/src/main/java/org/vufind/util/NormalizerFactory.java similarity index 100% rename from common/java/org/vufind/util/NormalizerFactory.java rename to src/main/java/org/vufind/util/NormalizerFactory.java diff --git a/common/java/org/vufind/util/TitleNormalizer.java b/src/main/java/org/vufind/util/TitleNormalizer.java similarity index 100% rename from common/java/org/vufind/util/TitleNormalizer.java rename to src/main/java/org/vufind/util/TitleNormalizer.java diff --git a/src/main/java/org/vufind/util/Utils.java b/src/main/java/org/vufind/util/Utils.java new file mode 100644 index 0000000..931a5c4 --- /dev/null +++ b/src/main/java/org/vufind/util/Utils.java @@ -0,0 +1,28 @@ +package org.vufind.util; + +import java.util.Arrays; +import java.util.Locale; + +public class Utils +{ + public static String getEnvironment(String var) + { + return (System.getenv(var) != null) ? + System.getenv(var) : System.getProperty(var.toLowerCase(Locale.ROOT)); + } + + public static void printDeprecationWarning(String ... lines) { + int maxLineLength = Arrays.stream(lines).map(String::length).max(Integer::compare).orElse(70); + + String separator = new String(new char[maxLineLength]).replace('\0', '*'); + + System.err.print("\n\n\n"); + System.err.println(separator); + System.err.println("DEPRECATION WARNING:\n"); + for (String line : lines) { + System.err.println(line); + } + System.err.println(separator); + System.err.print("\n\n\n"); + } +} diff --git a/tests/org/vufind/solr/handler/BrowseItemTest.java b/tests/org/vufind/solr/handler/BrowseItemTest.java index 73cb164..db56e58 100644 --- a/tests/org/vufind/solr/handler/BrowseItemTest.java +++ b/tests/org/vufind/solr/handler/BrowseItemTest.java @@ -126,13 +126,13 @@ public void testSetCountInt() int count = 37; BrowseItem item = new BrowseItem("", ""); item.setCount(count); - assertEquals(new Integer(count), item.get("count")); + assertEquals(Integer.valueOf(count), item.get("count")); } @Test public void testSetCountInteger() { - Integer count = new Integer(87); + Integer count = Integer.valueOf(87); BrowseItem item = new BrowseItem("", ""); item.setCount(count); assertEquals(count, item.get("count")); @@ -229,7 +229,7 @@ public void testGetFields() @Test public void testGetCount() { - Integer count = new Integer(87); + Integer count = Integer.valueOf(87); BrowseItem item = new BrowseItem("", ""); item.setCount(count); assertEquals(count, item.getCount());