From af3942eeddf958879ee041f4996b0fb7e218f3f4 Mon Sep 17 00:00:00 2001
From: Damien
Date: Thu, 7 Nov 2024 10:50:31 -0500
Subject: [PATCH 1/2] Fixed TitleNormalizer for Chinese characters

---

Notes:
    normalize() still cleans the title with the "titleSortLower" settings and
    still returns null when the cleaned title is null. Otherwise it now
    returns collator.getCollationKey(...).toByteArray() instead of
    String.getBytes() (which encodes with the platform default charset).
    `collator` is not declared in this diff; presumably it is inherited from
    ICUCollatorNormalizer -- confirm.

    NOTE(review): the type argument on the EnumSet context line is assumed to
    be FieldFormatter.eCleanVal (suggested by the FieldFormatter import in the
    hunk context) -- confirm against the base file before applying.

 src/main/java/org/vufind/util/TitleNormalizer.java | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/main/java/org/vufind/util/TitleNormalizer.java b/src/main/java/org/vufind/util/TitleNormalizer.java
index 7324516..503c226 100644
--- a/src/main/java/org/vufind/util/TitleNormalizer.java
+++ b/src/main/java/org/vufind/util/TitleNormalizer.java
@@ -5,14 +5,16 @@
 import org.solrmarc.index.extractor.formatter.FieldFormatter;
 import org.solrmarc.tools.DataUtil;
 
-public class TitleNormalizer implements Normalizer
+public class TitleNormalizer extends ICUCollatorNormalizer
 {
     @Override
     public byte[] normalize(String s)
     {
         EnumSet<FieldFormatter.eCleanVal> cleanValue = DataUtil.getCleanValForParam("titleSortLower");
         String normalizedTitle = DataUtil.cleanByVal(s, cleanValue);
-        byte[] bytes = normalizedTitle == null ? null : normalizedTitle.getBytes();
-        return bytes;
+        if (normalizedTitle == null) {
+            return null;
+        }
+        return collator.getCollationKey(normalizedTitle).toByteArray();
     }
 }

From 6eccf32726237a92817f03dbd9fcee040f0d7d76 Mon Sep 17 00:00:00 2001
From: Damien
Date: Thu, 7 Nov 2024 14:56:07 -0500
Subject: [PATCH 2/2] Added TitleNormalizerTest

---

Notes:
    The sort() helper orders titles by comparing the byte arrays returned by
    TitleNormalizer.normalize() element by element as unsigned values; when
    one array is a prefix of the other, the shorter one sorts first. The
    sorting tests assert the resulting order; the punctuation test asserts
    that two spellings normalize to identical bytes.

 .../browse/tests/TitleNormalizerTest.java | 121 ++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 tests/org/vufind/solr/browse/tests/TitleNormalizerTest.java

diff --git a/tests/org/vufind/solr/browse/tests/TitleNormalizerTest.java b/tests/org/vufind/solr/browse/tests/TitleNormalizerTest.java
new file mode 100644
index 0000000..74df73d
--- /dev/null
+++ b/tests/org/vufind/solr/browse/tests/TitleNormalizerTest.java
@@ -0,0 +1,121 @@
+package org.vufind.solr.browse.tests;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Collections;
+import java.util.List;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import org.vufind.util.TitleNormalizer;
+
+public class TitleNormalizerTest
+{
+    private TitleNormalizer titleNormalizer;
+
+    @Before
+    public void setUp()
+    {
+        titleNormalizer = new TitleNormalizer();
+    }
+
+
+    @Test
+    public void sortsSimpleStrings()
+    {
+        assertEquals(listOf("apple", "banana", "cherry", "orange"),
+                     sort(listOf("banana", "orange", "apple", "cherry")));
+    }
+
+
+    @Test
+    public void sortsDiacriticStrings()
+    {
+        assertEquals(listOf("AAA", "Äardvark", "Apple", "Banana", "grapefruit", "Orange"),
+                     sort(listOf("grapefruit", "Apple", "Orange", "AAA", "Äardvark", "Banana")));
+    }
+
+
+    @Test
+    public void handlesHyphensQuotesAndWhitespace()
+    {
+        assertEquals(listOf("AAA", "Äardvark", "Apple", "Banana", "grapefruit",
+                            "\"Hyphenated-words and double quotes\"",
+                            " inappropriate leading space",
+                            "Orange"),
+                     sort(listOf("Orange",
+                                 "\"Hyphenated-words and double quotes\"",
+                                 "Banana", "grapefruit",
+                                 " inappropriate leading space",
+                                 "Äardvark", "Apple", "AAA")));
+
+    }
+
+
+    @Test
+    public void sortsUnicodeCharacters()
+    {
+        assertEquals(listOf("apple", "바나나", "チェリー", "橙子"),
+                     sort(listOf("바나나", "橙子", "apple", "チェリー")));
+    }
+
+
+    @Test
+    public void ignoresPunctuationMixedWithSpaces()
+    {
+        assertArrayEquals(titleNormalizer.normalize("wharton, edith"), titleNormalizer.normalize("wharton edith"));
+        assertArrayEquals(titleNormalizer.normalize("st. john"), titleNormalizer.normalize("st john"));
+    }
+
+
+    //
+    // Helpers
+    //
+
+    private List<String> listOf(String ... args)
+    {
+        List<String> result = new ArrayList<String> ();
+        for (String s : args) {
+            result.add(s);
+        }
+
+        return result;
+    }
+
+
+    // http://stackoverflow.com/questions/5108091/java-comparator-for-byte-array-lexicographic
+    private int compareByteArrays(byte[] left, byte[] right)
+    {
+        for (int i = 0, j = 0; i < left.length && j < right.length; i++, j++) {
+            int a = (left[i] & 0xff);
+            int b = (right[j] & 0xff);
+            if (a != b) {
+                return a - b;
+            }
+        }
+        return left.length - right.length;
+    }
+
+
+    private List<String> sort(List<String> list)
+    {
+        List<String> result = new ArrayList<String> ();
+        result.addAll(list);
+
+        Collections.sort(result, new Comparator<String> () {
+            public int compare(String s1, String s2) {
+                return compareByteArrays(titleNormalizer.normalize(s1),
+                                         titleNormalizer.normalize(s2));
+            }
+        });
+
+        return result;
+    }
+
+}