diff --git a/docs/compatibility.md b/docs/compatibility.md index a905349398c..e7fcaa5b8a0 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -484,15 +484,12 @@ These are the known edge cases where running on the GPU will produce different r next to a newline or a repetition that produces zero or more results ([#5610](https://github.com/NVIDIA/spark-rapids/pull/5610))` - Word and non-word boundaries, `\b` and `\B` -- Line anchor `$` will incorrectly match any of the unicode characters `\u0085`, `\u2028`, or `\u2029` followed by - another line-terminator, such as `\n`. For example, the pattern `TEST$` will match `TEST\u0085\n` on the GPU but - not on the CPU ([#7585](https://github.com/NVIDIA/spark-rapids/issues/7585)). The following regular expression patterns are not yet supported on the GPU and will fall back to the CPU. - Line anchors `^` and `$` are not supported in some contexts, such as when combined with a choice (`^|a` or `$|a`). - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. -- String anchor `\z` is not supported +- String anchor `\z` is not supported. - Patterns containing an end of line or string anchor immediately next to a newline or repetition that produces zero or more results - Line anchor `$` and string anchors `\Z` are not supported in patterns containing `\W` or `\D` diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 6204de274b0..0318f578e89 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -335,10 +335,11 @@ class RegularExpressionTranspilerSuite extends AnyFunSuite { } test("line anchor $ - find") { - val patterns = Seq("a$", "a$b", "\f$", "$\f") + val patterns = Seq("a$", "a$b", "\f$", "$\f","TEST$") val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\f", "\f", "\r", "\u0085", "\u2028", - "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\n\r", - "\n\u0085", "\n\u2028", "\n\u2029", "2+|+??wD\n", "a\r\nb") + "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\n\r", + "\n\u0085", "\n\u2028", "\n\u2029", "2+|+??wD\n", "a\r\nb", + "TEST\u0085\n", "TEST\u0085\r", "TEST\u2028\r","TEST\u2028\u2029") assertCpuGpuMatchesRegexpFind(patterns, inputs) val unsupportedPatterns = Seq("[\r\n]?$", "$\r", "\r$", // "\u0085$", "\u2028$", "\u2029$", "\n$", "\r\n$", "[D$3]$")