From df9bd80d74b72aacf336d4f1a4e44ddaff2757ba Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Sun, 16 Jul 2023 18:59:05 +0000 Subject: [PATCH 1/3] reimplement C string literals --- compiler/rustc_lexer/src/cursor.rs | 4 ++ compiler/rustc_lexer/src/lib.rs | 7 ++++ compiler/rustc_parse/src/lexer/mod.rs | 26 +++++++++++- .../rfcs/rfc-3348-c-string-literals/basic.rs | 3 +- .../rfc-3348-c-string-literals/basic.stderr | 25 ------------ .../rfc-3348-c-string-literals/gate.stderr | 31 +++++--------- .../rfc-3348-c-string-literals/no-nuls.rs | Bin 760 -> 623 bytes .../rfc-3348-c-string-literals/no-nuls.stderr | Bin 4477 -> 674 bytes .../rfc-3348-c-string-literals/non-ascii.rs | 3 +- .../non-ascii.stderr | 38 ------------------ 10 files changed, 48 insertions(+), 89 deletions(-) delete mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/basic.stderr delete mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.stderr diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index eceef59802eb9..aba7f95487e9d 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -24,6 +24,10 @@ impl<'a> Cursor<'a> { } } + pub fn as_str(&self) -> &'a str { + self.chars.as_str() + } + /// Returns the last eaten symbol (or `'\0'` in release builds). /// (For debug assertions only.) pub(crate) fn prev(&self) -> char { diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 29335a8c0f4cd..d511d2b1280d9 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -367,6 +367,13 @@ impl Cursor<'_> { Some(|terminated| Byte { terminated }), ), + // c-string literal, raw c-string literal or identifier. + 'c' => self.c_or_byte_string( + |terminated| CStr { terminated }, + |n_hashes| RawCStr { n_hashes }, + None, + ), + // Identifier (this should be checked after other variant that can // start as identifier). c if is_id_start(c) => self.ident_or_unknown_prefix(), diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index c6e6b46e4551c..cfcc2ec42fac1 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -9,8 +9,8 @@ use rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, Diagnostic, DiagnosticBuilder, StashKey}; use rustc_lexer::unescape::{self, EscapeError, Mode}; -use rustc_lexer::Cursor; use rustc_lexer::{Base, DocStyle, RawStrError}; +use rustc_lexer::{Cursor, LiteralKind}; use rustc_session::lint::builtin::{ RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT, }; @@ -118,6 +118,7 @@ impl<'a> StringReader<'a> { let mut swallow_next_invalid = 0; // Skip trivial (whitespace & comments) tokens loop { + let str_before = self.cursor.as_str(); let token = self.cursor.advance_token(); let start = self.pos; self.pos = self.pos + BytePos(token.len); @@ -203,6 +204,29 @@ impl<'a> StringReader<'a> { .push(span); token::Ident(sym, false) } + // split up (raw) c string literals to an ident and a string literal when edition < 2021. + rustc_lexer::TokenKind::Literal { + kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }), + suffix_start: _, + } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => { + let prefix_len = match kind { + LiteralKind::CStr { .. } => 1, + LiteralKind::RawCStr { .. } => 2, + _ => unreachable!(), + }; + + // reset the state so that only the prefix ("c" or "cr") + // was consumed. + let lit_start = start + BytePos(prefix_len); + self.pos = lit_start; + self.cursor = Cursor::new(&str_before[prefix_len as usize..]); + + self.report_unknown_prefix(start); + let sym = nfc_normalize(self.str_from(start)); + let prefix_span = self.mk_sp(start, lit_start); + self.sess.symbol_gallery.insert(sym, prefix_span); + return (Token::new(token::Ident(sym, false), prefix_span), preceded_by_whitespace); + } rustc_lexer::TokenKind::Literal { kind, suffix_start } => { let suffix_start = start + BytePos(suffix_start); let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs index 3fc5fd481ea6d..5037396000bf0 100644 --- a/tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs +++ b/tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs @@ -1,5 +1,4 @@ -// FIXME(c_str_literals): This should be `run-pass` -// known-bug: #113333 +// run-pass // edition: 2021 #![feature(c_str_literals)] diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/basic.stderr b/tests/ui/rfcs/rfc-3348-c-string-literals/basic.stderr deleted file mode 100644 index 571c319d8c533..0000000000000 --- a/tests/ui/rfcs/rfc-3348-c-string-literals/basic.stderr +++ /dev/null @@ -1,25 +0,0 @@ -error: prefix `c` is unknown - --> $DIR/basic.rs:8:27 - | -LL | assert_eq!(b"test\0", c"test".to_bytes_with_nul()); - | ^ unknown prefix - | - = note: prefixed identifiers and literals are reserved since Rust 2021 -help: consider inserting whitespace here - | -LL | assert_eq!(b"test\0", c "test".to_bytes_with_nul()); - | + - -error: no rules expected the token `"test"` - --> $DIR/basic.rs:8:28 - | -LL | assert_eq!(b"test\0", c"test".to_bytes_with_nul()); - | -^^^^^ - | | - | no rules expected this token in macro call - | help: missing comma here - | - = note: while trying to match sequence start - -error: aborting due to 2 previous errors - diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr b/tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr index 8de36ca4a6edf..ea666e4330830 100644 --- a/tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr +++ b/tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr @@ -1,32 +1,21 @@ -error: prefix `c` is unknown +error[E0658]: `c".."` literals are experimental --> $DIR/gate.rs:10:5 | LL | c"foo"; - | ^ unknown prefix + | ^^^^^^ | - = note: prefixed identifiers and literals are reserved since Rust 2021 -help: consider inserting whitespace here - | -LL | c "foo"; - | + + = note: see issue #105723 for more information + = help: add `#![feature(c_str_literals)]` to the crate attributes to enable -error: prefix `c` is unknown +error[E0658]: `c".."` literals are experimental --> $DIR/gate.rs:13:8 | LL | m!(c"test"); - | ^ unknown prefix - | - = note: prefixed identifiers and literals are reserved since Rust 2021 -help: consider inserting whitespace here + | ^^^^^^^ | -LL | m!(c "test"); - | + - -error: expected one of `!`, `.`, `::`, `;`, `?`, `{`, `}`, or an operator, found `"foo"` - --> $DIR/gate.rs:10:6 - | -LL | c"foo"; - | ^^^^^ expected one of 8 possible tokens + = note: see issue #105723 for more information + = help: add `#![feature(c_str_literals)]` to the crate attributes to enable -error: aborting due to 3 previous errors +error: aborting due to 2 previous errors +For more information about this error, try `rustc --explain E0658`. diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs index 96945f125da71a9097f6f26c75a328f097448568..369173e23184e4537fc5fb34dbe716a984667c07 100644 GIT binary patch delta 208 zcmeyt`krN?m=9-NoPujmkbjWEWNSugO@*AylGLKaoMJ961t3UPiYcu&FsN0s26OfG lQBkP6avy>(F#*#$vQdJuM{o6V+eFij;NsFeP9W4gLhJlZ*h zcamo&GjEc>i)`QI`QBDJL|Jy>0woEs12?>tIV{pehENo2N=A%T4AVjzRS0gFB0|T{ z2$d2V!X=KOL*Rmj+caGIqa&?vpYB)VS2;Y^T!XG#D-N({u#;8@pfyp1IM$&HE6Vz|75 zu3sW|Kibf;!t3k1hm|vd^MGO6{N({>hu?dnKfSxeX+CuLli#$j9$@dIcremQ8dzg_ P7LFCq9*JKyoYbRF@$lQ` diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.stderr b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.stderr index 2226c7aa6a9ae9efc3b3cf7ce2270d90518dba9d..82d9f9cb3209164b185951574f52ddf533680441 100644 GIT binary patch literal 674 zcmcJNJqyAx5Qd%eD_)9gwF>p4f{QLfC+AX#ZO}kT$|VuO`rnNm+dAf^`+$4iJ6=Jg zDT8uNgX+PFS5}aLR5-y{lIo5|`W9~tI0-0iVO-m4vZ7`{93L@1U#?4~W52{K$Yh&r z>nG?~o}=>#P(?)&9r|Wp17*pW2*BP?vE4gKK0|)#BaDXCdP2l(zz>8@C(y+!8GS$4 g?Ypr6ivAwYZ{A2hlhsbZYNTjMJf(I9y*mtJU&n#TumAu6 literal 4477 zcmeI0T~FIE6o$FZuXw3;v1yd$L$naI?PfQVxZ)aBDd{P-$n3~=pcC-lXQy-`u3L2x zNQqc3j`HCouOH>SIX)z%kSR(@bGAg5XJ85yzvbed2f%Rn4Ih5|y!^t&kXMB|mMR^m zC*}$sgNqA1njIh?T!n-4`ec1PSwG?VxvSyk(pPYyY5h8yfz60(HfJOirhJBiX_C{z z97uvBMe@#!QjF(>%Svg)VH^buD#{eO;L6-D!MGWx8Nb2(!rZUQsTsW>Nv=#dI`KIO z=-&ZEp^rfI5Qa}Zp|GK~Es4ZK;-DW88Zz6?f8?3})iUf$HaJRs6yzR`5S?vekb?%` zNmo5VqA3(U2zxMyJq#SX=_-O(vc=@TEk6GtlV~#^nqOIWKZQotw# $DIR/non-ascii.rs:9:9 - | -LL | c"\xEF\x80🦀\u{1F980}".to_bytes_with_nul(), - | ^ unknown prefix - | - = note: prefixed identifiers and literals are reserved since Rust 2021 -help: consider inserting whitespace here - | -LL | c "\xEF\x80🦀\u{1F980}".to_bytes_with_nul(), - | + - -error: out of range hex escape - --> $DIR/non-ascii.rs:9:11 - | -LL | c"\xEF\x80🦀\u{1F980}".to_bytes_with_nul(), - | ^^^^ must be a character in the range [\x00-\x7f] - -error: out of range hex escape - --> $DIR/non-ascii.rs:9:15 - | -LL | c"\xEF\x80🦀\u{1F980}".to_bytes_with_nul(), - | ^^^^ must be a character in the range [\x00-\x7f] - -error: no rules expected the token `"\xEF\x80🦀\u{1F980}"` - --> $DIR/non-ascii.rs:9:10 - | -LL | c"\xEF\x80🦀\u{1F980}".to_bytes_with_nul(), - | -^^^^^^^^^^^^^^^^^^^^ - | | - | no rules expected this token in macro call - | help: missing comma here - | -note: while trying to match `,` - --> $SRC_DIR/core/src/macros/mod.rs:LL:COL - -error: aborting due to 4 previous errors - From 0d9c871736092fa804e1edf4e2d198cf2fe659df Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Sun, 23 Jul 2023 10:09:43 +0000 Subject: [PATCH 2/3] add proc macro test --- .../auxiliary/count.rs | 14 ++++++++++++++ .../rfc-3348-c-string-literals/edition-spans.rs | 16 ++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/auxiliary/count.rs create mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/edition-spans.rs diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/auxiliary/count.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/auxiliary/count.rs new file mode 100644 index 0000000000000..0907061d64a1b --- /dev/null +++ b/tests/ui/rfcs/rfc-3348-c-string-literals/auxiliary/count.rs @@ -0,0 +1,14 @@ +// force-host +// edition: 2018 +// no-prefer-dynamic +#![crate_type = "proc-macro"] + +extern crate proc_macro; + +use proc_macro::TokenStream; +use std::str::FromStr; + +#[proc_macro] +pub fn number_of_tokens(_: TokenStream) -> TokenStream { + TokenStream::from_str("c\"\"").unwrap().into_iter().count().to_string().parse().unwrap() +} diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/edition-spans.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/edition-spans.rs new file mode 100644 index 0000000000000..b3557c71b744e --- /dev/null +++ b/tests/ui/rfcs/rfc-3348-c-string-literals/edition-spans.rs @@ -0,0 +1,16 @@ +// even if this crate is edition 2021, proc macros compiled using older +// editions should still be able to observe the pre-2021 token behavior +// +// adapted from tests/ui/rust-2021/reserved-prefixes-via-macro.rs + +// edition: 2021 +// check-pass + +// aux-build: count.rs +extern crate count; + +const _: () = { + assert!(count::number_of_tokens!() == 2); +}; + +fn main() {} From a0376e9ec2af2d996b828f6a6153c7f53b100b0b Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Tue, 25 Jul 2023 09:24:12 +0000 Subject: [PATCH 3/3] extract common code --- compiler/rustc_parse/src/lexer/mod.rs | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index cfcc2ec42fac1..1931ee5e528dd 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -166,10 +166,7 @@ impl<'a> StringReader<'a> { continue; } rustc_lexer::TokenKind::Ident => { - let sym = nfc_normalize(self.str_from(start)); - let span = self.mk_sp(start, self.pos); - self.sess.symbol_gallery.insert(sym, span); - token::Ident(sym, false) + self.ident(start) } rustc_lexer::TokenKind::RawIdent => { let sym = nfc_normalize(self.str_from(start + BytePos(2))); @@ -183,10 +180,7 @@ impl<'a> StringReader<'a> { } rustc_lexer::TokenKind::UnknownPrefix => { self.report_unknown_prefix(start); - let sym = nfc_normalize(self.str_from(start)); - let span = self.mk_sp(start, self.pos); - self.sess.symbol_gallery.insert(sym, span); - token::Ident(sym, false) + self.ident(start) } rustc_lexer::TokenKind::InvalidIdent // Do not recover an identifier with emoji if the codepoint is a confusable @@ -222,10 +216,8 @@ impl<'a> StringReader<'a> { self.cursor = Cursor::new(&str_before[prefix_len as usize..]); self.report_unknown_prefix(start); - let sym = nfc_normalize(self.str_from(start)); let prefix_span = self.mk_sp(start, lit_start); - self.sess.symbol_gallery.insert(sym, prefix_span); - return (Token::new(token::Ident(sym, false), prefix_span), preceded_by_whitespace); + return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace); } rustc_lexer::TokenKind::Literal { kind, suffix_start } => { let suffix_start = start + BytePos(suffix_start); @@ -341,6 +333,13 @@ impl<'a> StringReader<'a> { } } + fn ident(&self, start: BytePos) -> TokenKind { + let sym = nfc_normalize(self.str_from(start)); + let span = self.mk_sp(start, self.pos); + self.sess.symbol_gallery.insert(sym, span); + token::Ident(sym, false) + } + fn struct_fatal_span_char( &self, from_pos: BytePos,