eclipse-platform · ptziegler · Dec 12, 2024 · Dec 12, 2024
diff --git a/...s/bundles/org.eclipse.core.resources/src/org/eclipse/core/internal/utils/TextMatcher.java b/...s/bundles/org.eclipse.core.resources/src/org/eclipse/core/internal/utils/TextMatcher.java
@@ -0,0 +1,208 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Thomas Wolf<[email protected]> and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+package org.eclipse.core.internal.utils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.regex.Pattern;
+import org.eclipse.core.text.StringMatcher;
+
+/**
+ * Similar to {@link StringMatcher}, this {@code TextMatcher} matches a pattern
+ * that may contain the wildcards '?' or '*' against a text. However, the
+ * matching is not only done on the full text, but also on individual words from
+ * the text, and if the pattern contains whitespace, the pattern is split into
+ * sub-patterns and those are matched, too.
+ * <p>
+ * The precise rules are:
+ * </p>
+ * <ul>
+ * <li>Leading and trailing whitespace in the pattern is ignored.</li>
+ * <li>If the full pattern matches the full text, the match succeeds.</li>
+ * <li>If the full pattern matches a single word of the text, the match
+ * succeeds.</li>
+ * <li>If the pattern consists of several whitespace-separated sub-patterns and
+ * each of them matches a prefix of the whole text or a prefix of any word, the
+ * match succeeds.</li>
+ * <li>Otherwise, the match fails.</li>
+ * </ul>
+ * <p>
+ * An empty pattern matches only the empty text.
+ * </p>
+ */
+public final class TextMatcher {
+
+	/** Word separator: any run of characters that are not Unicode word characters. */
+	private static final Pattern NON_WORD = Pattern.compile("\\W+", Pattern.UNICODE_CHARACTER_CLASS); //$NON-NLS-1$
+
+	/** Sub-pattern separator: any run of whitespace. */
+	private static final Pattern WHITESPACE = Pattern.compile("\\s+"); //$NON-NLS-1$
+
+	/** Matches the complete pattern, stripped of leading and trailing whitespace. */
+	private final StringMatcher full;
+
+	/**
+	 * Prefix matchers for the whitespace-separated sub-patterns; empty if the
+	 * pattern has fewer than two sub-patterns.
+	 */
+	private final List<StringMatcher> parts;
+
+	/**
+	 * Creates a new {@link TextMatcher}.
+	 *
+	 * @param pattern         to match
+	 * @param ignoreCase      whether to do case-insensitive matching
+	 * @param ignoreWildCards whether to treat '?' and '*' as normal characters, not
+	 *                        as wildcards
+	 * @throws IllegalArgumentException if {@code pattern == null}
+	 */
+	public TextMatcher(String pattern, boolean ignoreCase, boolean ignoreWildCards) {
+		if (pattern == null) {
+			// Fail as documented rather than with a NullPointerException from trim().
+			throw new IllegalArgumentException();
+		}
+		String trimmed = pattern.trim();
+		full = new StringMatcher(trimmed, ignoreCase, ignoreWildCards);
+		parts = splitPattern(trimmed, ignoreCase, ignoreWildCards);
+	}
+
+	/**
+	 * Creates one prefix matcher per whitespace-separated sub-pattern.
+	 *
+	 * @param trimmedPattern  pattern without leading and trailing whitespace
+	 * @param ignoreCase      whether to do case-insensitive matching
+	 * @param ignoreWildCards whether to treat '?' and '*' as normal characters, not
+	 *                        as wildcards
+	 * @return the sub-pattern matchers; empty if the pattern is empty or contains
+	 *         no whitespace
+	 */
+	private static List<StringMatcher> splitPattern(String trimmedPattern, boolean ignoreCase,
+			boolean ignoreWildCards) {
+		if (trimmedPattern.isEmpty()) {
+			return Collections.emptyList();
+		}
+		String[] subPatterns = WHITESPACE.split(trimmedPattern);
+		if (subPatterns.length <= 1) {
+			return Collections.emptyList();
+		}
+		List<StringMatcher> matchers = new ArrayList<>(subPatterns.length);
+		for (String subPattern : subPatterns) {
+			if (subPattern.isEmpty()) {
+				continue;
+			}
+			StringMatcher matcher = new StringMatcher(subPattern, ignoreCase, ignoreWildCards);
+			matcher.usePrefixMatch();
+			matchers.add(matcher);
+		}
+		return matchers;
+	}
+
+	/**
+	 * Determines whether the given {@code text} matches the pattern.
+	 *
+	 * @param text String to match; must not be {@code null}
+	 * @return {@code true} if the {@code text} matches the pattern according to the
+	 *         rules described for this class; {@code false} otherwise
+	 * @throws IllegalArgumentException if {@code text == null}
+	 */
+	public boolean match(String text) {
+		if (text == null) {
+			throw new IllegalArgumentException();
+		}
+		return match(text, 0, text.length());
+	}
+
+	/**
+	 * Determines whether the given sub-string of {@code text} from {@code start}
+	 * (inclusive) to {@code end} (exclusive) matches the pattern. Indices outside
+	 * of the text are clamped to its bounds.
+	 *
+	 * @param text  String to match in; must not be {@code null}
+	 * @param start start index (inclusive) within {@code text} of the sub-string to
+	 *              match
+	 * @param end   end index (exclusive) within {@code text} of the sub-string to
+	 *              match
+	 * @return {@code true} if the given slice of {@code text} matches the pattern;
+	 *         {@code false} otherwise, including when {@code start > end} or when
+	 *         the range lies completely outside of {@code text}
+	 * @throws IllegalArgumentException if {@code text == null}
+	 */
+	public boolean match(String text, int start, int end) {
+		if (text == null) {
+			throw new IllegalArgumentException();
+		}
+		int from = Math.max(0, start);
+		int to = Math.min(end, text.length());
+		// Check after clamping: besides start > end this also rejects ranges that lie
+		// completely outside of the text, for which substring() below would throw.
+		if (from > to) {
+			return false;
+		}
+		if (full.match(text, from, to)) {
+			return true;
+		}
+		String[] words = getWords(text.substring(from, to));
+		if (match(full, words)) {
+			return true;
+		}
+		if (parts.isEmpty()) {
+			return false;
+		}
+		// Every sub-pattern must match a prefix of the text or of one of its words.
+		for (StringMatcher subMatcher : parts) {
+			if (!subMatcher.match(text, from, to) && !match(subMatcher, words)) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/**
+	 * Tells whether the matcher matches at least one of the given words.
+	 *
+	 * @param matcher to apply to each word
+	 * @param words   candidate words, as returned by {@link #getWords(String)}
+	 * @return {@code true} if any non-empty word matches; {@code false} otherwise
+	 */
+	private boolean match(StringMatcher matcher, String[] words) {
+		// getWords() yields a leading empty string for texts starting with a non-word
+		// character. It is not a word; skipping it keeps an empty pattern from
+		// matching such texts.
+		return Arrays.stream(words).filter(Objects::nonNull).filter(word -> !word.isEmpty())
+				.anyMatch(matcher::match);
+	}
+
+	/**
+	 * Splits a given text into words.
+	 *
+	 * @param text to split
+	 * @return the words of the text
+	 */
+	public static String[] getWords(String text) {
+		// Previous implementations (in the removed StringMatcher) used the ICU
+		// BreakIterator to split the text. That worked well, but in 2020 it was decided
+		// to drop the dependency to the ICU library due to its size. The JDK
+		// BreakIterator splits differently, causing e.g.
+		// https://bugs.eclipse.org/bugs/show_bug.cgi?id=563121 . The NON_WORD regexp
+		// appears to work well for programming language text, but may give sub-optimal
+		// results for natural languages. See also
+		// https://bugs.eclipse.org/bugs/show_bug.cgi?id=90579 .
+		return NON_WORD.split(text);
+	}
+
+	@Override
+	public String toString() {
+		return '[' + full.toString() + ',' + parts + ']';
+	}
+}
diff --git a/resources/tests/org.eclipse.core.tests.resources/META-INF/MANIFEST.MF b/resources/tests/org.eclipse.core.tests.resources/META-INF/MANIFEST.MF
@@ -2,7 +2,7 @@ Manifest-Version: 1.0
 Bundle-ManifestVersion: 2
 Bundle-Name: Eclipse Core Tests Resources
 Bundle-SymbolicName: org.eclipse.core.tests.resources; singleton:=true
-Bundle-Version: 3.11.700.qualifier
+Bundle-Version: 3.11.800.qualifier
 Bundle-Vendor: Eclipse.org
 Export-Package: org.eclipse.core.tests.filesystem,
  org.eclipse.core.tests.internal.alias,

diff --git a/resources/tests/org.eclipse.core.tests.resources/pom.xml b/resources/tests/org.eclipse.core.tests.resources/pom.xml
@@ -18,7 +18,7 @@
     <version>4.35.0-SNAPSHOT</version>
   </parent>
   <artifactId>org.eclipse.core.tests.resources</artifactId>
-  <version>3.11.700-SNAPSHOT</version>
+  <version>3.11.800-SNAPSHOT</version>
   <packaging>eclipse-test-plugin</packaging>
 
   <properties>

diff --git a/...eclipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/AllUtilsTests.java b/...eclipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/AllUtilsTests.java
@@ -20,6 +20,7 @@
 @Suite
 @SelectClasses({ //
 		ObjectMapTest.class, //
-		FileUtilTest.class, })
+		FileUtilTest.class, //
+		TextMatcherTest.class })
 public class AllUtilsTests {
 }
diff --git a/...lipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/TextMatcherTest.java b/...lipse.core.tests.resources/src/org/eclipse/core/tests/internal/utils/TextMatcherTest.java
@@ -0,0 +1,118 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Thomas Wolf<[email protected]> and others.
+ *
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+package org.eclipse.core.tests.internal.utils;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.eclipse.core.internal.utils.TextMatcher;
+import org.junit.Test;
+
+/**
+ * Tests for {@link TextMatcher}.
+ */
+public class TextMatcherTest {
+
+	/** Sample text in which "hühner" is a word of its own. */
+	private static final String HUEHNER = "hahn henne hühner küken huhn";
+
+	/** Sample text in which "hühner" only occurs as the start of "hühnerhof". */
+	private static final String HUEHNERHOF = "hahn henne hühnerhof küken huhn";
+
+	/** Builds a matcher for {@code pattern} and applies it to the whole {@code text}. */
+	private static boolean matches(String pattern, boolean ignoreCase, boolean ignoreWildCards, String text) {
+		return new TextMatcher(pattern, ignoreCase, ignoreWildCards).match(text);
+	}
+
+	@Test
+	public void testEmpty() {
+		for (boolean ignoreWildCards : new boolean[] { false, true }) {
+			assertTrue(matches("", false, ignoreWildCards, ""));
+			assertFalse(matches("", false, ignoreWildCards, "foo"));
+			assertFalse(matches("", false, ignoreWildCards, "foo bar baz"));
+		}
+	}
+
+	@Test
+	public void testSuffixes() {
+		for (String text : new String[] { "foobar_123", "foobar_baz" }) {
+			assertFalse(matches("fo*ar", false, false, text));
+		}
+	}
+
+	@Test
+	public void testChinese() {
+		assertTrue(matches("喜欢", false, false, "我 喜欢 吃 苹果。"));
+		// This test would work only if word-splitting used the ICU BreakIterator.
+		// "Words" are as shown above.
+		// assertTrue(matches("喜欢", false, false, "我喜欢吃苹果。"));
+	}
+
+	@Test
+	public void testSingleWords() {
+		String[] huehnerPatterns = { "h?hner", "h*hner", "hühner" };
+
+		assertTrue(matches("huhn", false, false, HUEHNER));
+		for (String pattern : huehnerPatterns) {
+			assertTrue(matches(pattern, false, false, HUEHNER));
+		}
+		// Full pattern must match word fully
+		for (String pattern : huehnerPatterns) {
+			assertFalse(matches(pattern, false, false, HUEHNERHOF));
+		}
+
+		// Same checks with '?' and '*' treated as normal characters
+		assertTrue(matches("huhn", false, true, HUEHNER));
+		assertFalse(matches("h?hner", false, true, HUEHNER));
+		assertFalse(matches("h*hner", false, true, HUEHNER));
+		assertTrue(matches("hühner", false, true, HUEHNER));
+		// Full pattern must match word fully
+		for (String pattern : huehnerPatterns) {
+			assertFalse(matches(pattern, false, true, HUEHNERHOF));
+		}
+
+		// Bug 570390: Pattern starting/ending with whitespace should still match
+		for (String pattern : new String[] { "hahn ", "huhn ", " hahn", " huhn" }) {
+			assertTrue(matches(pattern, false, false, HUEHNERHOF));
+		}
+	}
+
+	@Test
+	public void testMultipleWords() {
+		for (String pattern : new String[] { "huhn h?hner", "huhn h*hner" }) {
+			assertTrue(matches(pattern, false, false, HUEHNER));
+			assertTrue(matches(pattern, false, false, HUEHNERHOF));
+			assertFalse(matches(pattern, false, true, HUEHNER));
+			assertFalse(matches(pattern, false, true, HUEHNERHOF));
+		}
+		for (boolean ignoreWildCards : new boolean[] { false, true }) {
+			assertTrue(matches("huhn hühner", false, ignoreWildCards, HUEHNER));
+			assertTrue(matches("huhn hühner", false, ignoreWildCards, HUEHNERHOF));
+		}
+
+		// Bug 570390: Pattern starting/ending with whitespace should still match
+		for (String pattern : new String[] { "huhn hahn ", "hahn huhn ", " huhn hahn", " hahn huhn" }) {
+			assertTrue(matches(pattern, false, false, HUEHNERHOF));
+		}
+	}
+
+	@Test
+	public void testCaseInsensitivity() {
+		for (boolean ignoreWildCards : new boolean[] { false, true }) {
+			assertTrue(matches("Huhn HÜHNER", true, ignoreWildCards, HUEHNER));
+			assertTrue(matches("Huhn HÜHNER", true, ignoreWildCards, HUEHNERHOF));
+		}
+		for (boolean ignoreWildCards : new boolean[] { false, true }) {
+			assertTrue(matches("HüHnEr", true, ignoreWildCards, HUEHNER));
+			assertFalse(matches("HüHnEr", true, ignoreWildCards, HUEHNERHOF));
+		}
+	}
+}