From b104612726d94cfd265e7abe87222e9b8a5aba58 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Wed, 30 Oct 2024 11:29:44 -0700 Subject: [PATCH] resolve issue: \Zx where x is any escape sequence Signed-off-by: Suraj Aralihalli --- docs/compatibility.md | 4 +++- .../src/main/scala/com/nvidia/spark/rapids/RegexParser.scala | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index e7fcaa5b8a0..67fb9e9ea2c 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -490,8 +490,10 @@ The following regular expression patterns are not yet supported on the GPU and w - Line anchors `^` and `$` are not supported in some contexts, such as when combined with a choice (`^|a` or `$|a`). - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. - String anchor `\z` is not supported. -- Patterns containing an end of line or string anchor immediately next to a newline or repetition that produces zero +- Patterns containing an end-of-line or string anchor immediately next to a newline or repetition that produces zero or more results +- Patterns containing end-of-line anchors like `$` or `\Z` immediately followed by + any of the escape sequences `\b`, `\B`, `\s`, `\S`, `\d`, `\D`, `\w`, `\W`, `\a`, `\A`, or `\f` are not supported.
- Line anchor `$` and string anchors `\Z` are not supported in patterns containing `\W` or `\D` - Line and string anchors are not supported by `string_split` and `str_to_map` - Lazy quantifiers within a choice block such as `(2|\u2029??)+` diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index d1b6beb2095..8dc6e3a748c 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -1395,9 +1395,10 @@ class CudfRegexTranspiler(mode: RegexMode) { // however, cudf doesn't support negative look ahead throw new RegexUnsupportedException("Regex sequence $\\z is not supported", part.position) - case RegexEscaped(a) if "bB".contains(a) => + case RegexEscaped(a) if "bBsSdDwWaAf".contains(a) => throw new RegexUnsupportedException( - "Regex sequences with \\b or \\B not supported around $", part.position) + s"Regex sequences with \\$a are not supported around end-of-line markers " + + "like $ or \\Z at position", part.position) case _ => r.append(rewrite(part, replacement, last, flags)) }