diff --git a/resources/bundles/org.eclipse.core.resources/src/org/eclipse/core/internal/utils/TextMatcher.java b/resources/bundles/org.eclipse.core.resources/src/org/eclipse/core/internal/utils/TextMatcher.java
new file mode 100644
index 00000000000..4fec70a8ebb
--- /dev/null
+++ b/resources/bundles/org.eclipse.core.resources/src/org/eclipse/core/internal/utils/TextMatcher.java
@@ -0,0 +1,201 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Thomas Wolf and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+package org.eclipse.core.internal.utils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.regex.Pattern;
+import org.eclipse.core.text.StringMatcher;
+
+/**
+ * Similar to {@link StringMatcher}, this {@code TextMatcher} matches a pattern
+ * that may contain the wildcards '?' or '*' against a text. However, the
+ * matching is not only done on the full text, but also on individual words from
+ * the text, and if the pattern contains whitespace, the pattern is split into
+ * sub-patterns and those are matched, too.
+ * <p>
+ * The precise rules are:
+ * </p>
+ * <ul>
+ * <li>If the full pattern matches the full text, the match succeeds.</li>
+ * <li>If the full pattern matches a single word of the text, the match
+ * succeeds.</li>
+ * <li>If the pattern contains whitespace and every sub-pattern matches a prefix
+ * of the whole text or a prefix of some word, the match succeeds.</li>
+ * <li>Otherwise, the match fails.</li>
+ * </ul>
+ * <p>
+ * An empty pattern matches only the empty text.
+ * </p>
+ */
+public final class TextMatcher {
+
+    /** Word separator: one or more non-word characters (Unicode-aware). */
+    private static final Pattern NON_WORD = Pattern.compile("\\W+", Pattern.UNICODE_CHARACTER_CLASS); //$NON-NLS-1$
+
+    /** Matcher for the complete, trimmed pattern. */
+    private final StringMatcher full;
+
+    /**
+     * Prefix matchers for the whitespace-separated sub-patterns; empty unless the
+     * pattern consists of at least two sub-patterns.
+     */
+    private final List<StringMatcher> parts;
+
+    /**
+     * Creates a new {@link TextMatcher}.
+     *
+     * @param pattern         to match; leading and trailing whitespace is ignored
+     * @param ignoreCase      whether to do case-insensitive matching
+     * @param ignoreWildCards whether to treat '?' and '*' as normal characters, not
+     *                        as wildcards
+     * @throws IllegalArgumentException if {@code pattern == null}
+     */
+    public TextMatcher(String pattern, boolean ignoreCase, boolean ignoreWildCards) {
+        if (pattern == null) {
+            // Without this check pattern.trim() fails with a NullPointerException
+            // instead of the documented IllegalArgumentException.
+            throw new IllegalArgumentException("pattern must not be null"); //$NON-NLS-1$
+        }
+        String trimmed = pattern.trim();
+        full = new StringMatcher(trimmed, ignoreCase, ignoreWildCards);
+        parts = splitPattern(trimmed, ignoreCase, ignoreWildCards);
+    }
+
+    /**
+     * Splits an already trimmed pattern at whitespace into prefix matchers.
+     *
+     * @param trimmedPattern  pattern without leading or trailing whitespace
+     * @param ignoreCase      whether to do case-insensitive matching
+     * @param ignoreWildCards whether to treat '?' and '*' as normal characters
+     * @return one prefix matcher per sub-pattern, or an empty list if the pattern
+     *         contains no whitespace
+     */
+    private static List<StringMatcher> splitPattern(String trimmedPattern, boolean ignoreCase,
+            boolean ignoreWildCards) {
+        if (trimmedPattern.isEmpty()) {
+            return Collections.emptyList();
+        }
+        String[] subPatterns = trimmedPattern.split("\\s+"); //$NON-NLS-1$
+        if (subPatterns.length <= 1) {
+            return Collections.emptyList();
+        }
+        List<StringMatcher> matchers = new ArrayList<>(subPatterns.length);
+        for (String subPattern : subPatterns) {
+            if (subPattern.isEmpty()) {
+                continue;
+            }
+            StringMatcher matcher = new StringMatcher(subPattern, ignoreCase, ignoreWildCards);
+            matcher.usePrefixMatch();
+            matchers.add(matcher);
+        }
+        return matchers;
+    }
+
+    /**
+     * Determines whether the given {@code text} matches the pattern.
+     *
+     * @param text String to match; must not be {@code null}
+     * @return {@code true} if the whole {@code text} matches the pattern;
+     *         {@code false} otherwise
+     * @throws IllegalArgumentException if {@code text == null}
+     */
+    public boolean match(String text) {
+        if (text == null) {
+            throw new IllegalArgumentException("text must not be null"); //$NON-NLS-1$
+        }
+        return match(text, 0, text.length());
+    }
+
+    /**
+     * Determines whether the given sub-string of {@code text} from {@code start}
+     * (inclusive) to {@code end} (exclusive) matches the pattern. Indices outside
+     * of the text are clamped to its bounds; if no valid range remains, the result
+     * is {@code false}.
+     *
+     * @param text  String to match in; must not be {@code null}
+     * @param start start index (inclusive) within {@code text} of the sub-string to
+     *              match
+     * @param end   end index (exclusive) within {@code text} of the sub-string to
+     *              match
+     * @return {@code true} if the given slice of {@code text} matches the pattern;
+     *         {@code false} otherwise
+     * @throws IllegalArgumentException if {@code text == null}
+     */
+    public boolean match(String text, int start, int end) {
+        if (text == null) {
+            throw new IllegalArgumentException("text must not be null"); //$NON-NLS-1$
+        }
+        // Clamp first, compare afterwards: a range entirely outside the text
+        // (start > length, or end < 0) would otherwise reach substring() with
+        // from > to and fail with a StringIndexOutOfBoundsException.
+        int from = Math.max(0, start);
+        int to = Math.min(end, text.length());
+        if (from > to) {
+            return false;
+        }
+        if (full.match(text, from, to)) {
+            return true;
+        }
+        String[] words = getWords(text.substring(from, to));
+        if (matchesAnyWord(full, words)) {
+            return true;
+        }
+        if (parts.isEmpty()) {
+            return false;
+        }
+        // Every sub-pattern has to match the slice itself or one of its words.
+        for (StringMatcher subMatcher : parts) {
+            if (!subMatcher.match(text, from, to) && !matchesAnyWord(subMatcher, words)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Tells whether the matcher accepts at least one of the given words. Empty
+     * words are skipped: {@link #getWords(String)} yields a leading empty string
+     * for text that starts with a separator, and that must not let an empty
+     * pattern match non-empty text.
+     */
+    private static boolean matchesAnyWord(StringMatcher matcher, String[] words) {
+        return Arrays.stream(words).filter(Objects::nonNull).filter(word -> !word.isEmpty())
+                .anyMatch(matcher::match);
+    }
+
+    /**
+     * Splits a given text into words.
+     *
+     * @param text to split
+     * @return the words of the text; may contain an empty string if the text is
+     *         empty or starts with a non-word character
+     */
+    public static String[] getWords(String text) {
+        // Previous implementations (in the removed StringMatcher) used the ICU
+        // BreakIterator to split the text. That worked well, but in 2020 it was decided
+        // to drop the dependency to the ICU library due to its size. The JDK
+        // BreakIterator splits differently, causing e.g.
+        // https://bugs.eclipse.org/bugs/show_bug.cgi?id=563121 . The NON_WORD regexp
+        // appears to work well for programming language text, but may give sub-optimal
+        // results for natural languages. See also
+        // https://bugs.eclipse.org/bugs/show_bug.cgi?id=90579 .
+        return NON_WORD.split(text);
+    }
+
+    @Override
+    public String toString() {
+        return '[' + full.toString() + ',' + parts + ']';
+    }
+}
diff --git a/resources/tests/org.eclipse.core.tests.resources/META-INF/MANIFEST.MF b/resources/tests/org.eclipse.core.tests.resources/META-INF/MANIFEST.MF index cbb54734ffd..9f542ab492a 100644 --- a/resources/tests/org.eclipse.core.tests.resources/META-INF/MANIFEST.MF +++ b/resources/tests/org.eclipse.core.tests.resources/META-INF/MANIFEST.MF @@ -2,7 +2,7 @@ Manifest-Version: 1.0 Bundle-ManifestVersion: 2 Bundle-Name: Eclipse Core Tests Resources Bundle-SymbolicName: org.eclipse.core.tests.resources; singleton:=true -Bundle-Version: 3.11.700.qualifier +Bundle-Version: 3.11.800.qualifier Bundle-Vendor: Eclipse.org Export-Package: org.eclipse.core.tests.filesystem, org.eclipse.core.tests.internal.alias,
diff --git a/resources/tests/org.eclipse.core.tests.resources/pom.xml b/resources/tests/org.eclipse.core.tests.resources/pom.xml index 44f9740e18c..d5293064bf1 100644 --- a/resources/tests/org.eclipse.core.tests.resources/pom.xml +++ b/resources/tests/org.eclipse.core.tests.resources/pom.xml @@ -18,7 +18,7 @@ 4.35.0-SNAPSHOT org.eclipse.core.tests.resources - 3.11.700-SNAPSHOT + 3.11.800-SNAPSHOT eclipse-test-plugin
diff --git a/resources/tests/org.eclipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/AllUtilsTests.java b/resources/tests/org.eclipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/AllUtilsTests.java index 795abffb352..713007595d6 100644 --- a/resources/tests/org.eclipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/AllUtilsTests.java +++ b/resources/tests/org.eclipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/AllUtilsTests.java @@ -20,6 +20,7 @@ @Suite @SelectClasses({ // - ObjectMapTest.class, // - FileUtilTest.class, }) + FileUtilTest.class, // + TextMatcherTest.class }) public class AllUtilsTests { }
diff --git 
a/resources/tests/org.eclipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/TextMatcherTest.java b/resources/tests/org.eclipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/TextMatcherTest.java
new file mode 100644
index 00000000000..cf67185e9af
--- /dev/null
+++ b/resources/tests/org.eclipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/TextMatcherTest.java
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Thomas Wolf and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+package org.eclipse.core.tests.internal.utils;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.eclipse.core.internal.utils.TextMatcher;
+import org.junit.Test;
+
+/**
+ * Tests for {@link TextMatcher}.
+ */
+public class TextMatcherTest {
+
+    /** Sample text in which "hühner" occurs as a complete word. */
+    private static final String HENS = "hahn henne hühner küken huhn";
+
+    /** Sample text in which "hühner" occurs only as the start of "hühnerhof". */
+    private static final String HENHOUSE = "hahn henne hühnerhof küken huhn";
+
+    /** Creates a matcher with the given settings and applies it to {@code text}. */
+    private static boolean matches(String pattern, boolean ignoreCase, boolean ignoreWildCards, String text) {
+        return new TextMatcher(pattern, ignoreCase, ignoreWildCards).match(text);
+    }
+
+    @Test
+    public void testEmpty() {
+        assertTrue(matches("", false, false, ""));
+        assertFalse(matches("", false, false, "foo"));
+        assertFalse(matches("", false, false, "foo bar baz"));
+        assertTrue(matches("", false, true, ""));
+        assertFalse(matches("", false, true, "foo"));
+        assertFalse(matches("", false, true, "foo bar baz"));
+    }
+
+    @Test
+    public void testSuffixes() {
+        assertFalse(matches("fo*ar", false, false, "foobar_123"));
+        assertFalse(matches("fo*ar", false, false, "foobar_baz"));
+    }
+
+    @Test
+    public void testChinese() {
+        assertTrue(matches("喜欢", false, false, "我 喜欢 吃 苹果。"));
+        // This test would work only if word-splitting used the ICU BreakIterator.
+        // "Words" are as shown above.
+        // assertTrue(new TextMatcher("喜欢", false, false).match("我喜欢吃苹果。"));
+    }
+
+    @Test
+    public void testSingleWords() {
+        assertTrue(matches("huhn", false, false, HENS));
+        assertTrue(matches("h?hner", false, false, HENS));
+        assertTrue(matches("h*hner", false, false, HENS));
+        assertTrue(matches("hühner", false, false, HENS));
+        // Full pattern must match word fully
+        assertFalse(matches("h?hner", false, false, HENHOUSE));
+        assertFalse(matches("h*hner", false, false, HENHOUSE));
+        assertFalse(matches("hühner", false, false, HENHOUSE));
+
+        assertTrue(matches("huhn", false, true, HENS));
+        assertFalse(matches("h?hner", false, true, HENS));
+        assertFalse(matches("h*hner", false, true, HENS));
+        assertTrue(matches("hühner", false, true, HENS));
+        // Full pattern must match word fully
+        assertFalse(matches("h?hner", false, true, HENHOUSE));
+        assertFalse(matches("h*hner", false, true, HENHOUSE));
+        assertFalse(matches("hühner", false, true, HENHOUSE));
+
+        // Bug 570390: Pattern starting/ending with whitespace should still match
+        assertTrue(matches("hahn ", false, false, HENHOUSE));
+        assertTrue(matches("huhn ", false, false, HENHOUSE));
+        assertTrue(matches(" hahn", false, false, HENHOUSE));
+        assertTrue(matches(" huhn", false, false, HENHOUSE));
+    }
+
+    @Test
+    public void testMultipleWords() {
+        assertTrue(matches("huhn h?hner", false, false, HENS));
+        assertTrue(matches("huhn h?hner", false, false, HENHOUSE));
+        assertFalse(matches("huhn h?hner", false, true, HENS));
+        assertFalse(matches("huhn h?hner", false, true, HENHOUSE));
+        assertTrue(matches("huhn h*hner", false, false, HENS));
+        assertTrue(matches("huhn h*hner", false, false, HENHOUSE));
+        assertFalse(matches("huhn h*hner", false, true, HENS));
+        assertFalse(matches("huhn h*hner", false, true, HENHOUSE));
+        assertTrue(matches("huhn hühner", false, false, HENS));
+        assertTrue(matches("huhn hühner", false, false, HENHOUSE));
+        assertTrue(matches("huhn hühner", false, true, HENS));
+        assertTrue(matches("huhn hühner", false, true, HENHOUSE));
+
+        // Bug 570390: Pattern starting/ending with whitespace should still match
+        assertTrue(matches("huhn hahn ", false, false, HENHOUSE));
+        assertTrue(matches("hahn huhn ", false, false, HENHOUSE));
+        assertTrue(matches(" huhn hahn", false, false, HENHOUSE));
+        assertTrue(matches(" hahn huhn", false, false, HENHOUSE));
+    }
+
+    @Test
+    public void testCaseInsensitivity() {
+        assertTrue(matches("Huhn HÜHNER", true, false, HENS));
+        assertTrue(matches("Huhn HÜHNER", true, false, HENHOUSE));
+        assertTrue(matches("Huhn HÜHNER", true, true, HENS));
+        assertTrue(matches("Huhn HÜHNER", true, true, HENHOUSE));
+        assertTrue(matches("HüHnEr", true, false, HENS));
+        assertFalse(matches("HüHnEr", true, false, HENHOUSE));
+        assertTrue(matches("HüHnEr", true, true, HENS));
+        assertFalse(matches("HüHnEr", true, true, HENHOUSE));
+    }
+}