Skip to content

Commit 5a68abb

Browse files
committed
Tokenize emoji as if they were valid identifiers
In the lexer, consider emojis to be valid identifiers and reject them later to avoid knock-down parse errors.
1 parent 311fa1f commit 5a68abb

File tree

8 files changed

+180
-2
lines changed

8 files changed

+180
-2
lines changed

Cargo.lock

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4040,6 +4040,7 @@ name = "rustc_lexer"
40404040
version = "0.1.0"
40414041
dependencies = [
40424042
"expect-test",
4043+
"unic-emoji-char",
40434044
"unicode-xid",
40444045
]
40454046

@@ -5510,6 +5511,47 @@ version = "0.1.3"
55105511
source = "registry+https://github.com/rust-lang/crates.io-index"
55115512
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
55125513

5514+
[[package]]
5515+
name = "unic-char-property"
5516+
version = "0.9.0"
5517+
source = "registry+https://github.com/rust-lang/crates.io-index"
5518+
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
5519+
dependencies = [
5520+
"unic-char-range",
5521+
]
5522+
5523+
[[package]]
5524+
name = "unic-char-range"
5525+
version = "0.9.0"
5526+
source = "registry+https://github.com/rust-lang/crates.io-index"
5527+
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
5528+
5529+
[[package]]
5530+
name = "unic-common"
5531+
version = "0.9.0"
5532+
source = "registry+https://github.com/rust-lang/crates.io-index"
5533+
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
5534+
5535+
[[package]]
5536+
name = "unic-emoji-char"
5537+
version = "0.9.0"
5538+
source = "registry+https://github.com/rust-lang/crates.io-index"
5539+
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
5540+
dependencies = [
5541+
"unic-char-property",
5542+
"unic-char-range",
5543+
"unic-ucd-version",
5544+
]
5545+
5546+
[[package]]
5547+
name = "unic-ucd-version"
5548+
version = "0.9.0"
5549+
source = "registry+https://github.com/rust-lang/crates.io-index"
5550+
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
5551+
dependencies = [
5552+
"unic-common",
5553+
]
5554+
55135555
[[package]]
55145556
name = "unicase"
55155557
version = "2.6.0"

compiler/rustc_interface/src/passes.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
3535
use rustc_session::search_paths::PathKind;
3636
use rustc_session::{Limit, Session};
3737
use rustc_span::symbol::{sym, Ident, Symbol};
38-
use rustc_span::FileName;
38+
use rustc_span::{FileName, MultiSpan};
3939
use rustc_trait_selection::traits;
4040
use rustc_typeck as typeck;
4141
use tempfile::Builder as TempFileBuilder;
@@ -450,6 +450,16 @@ pub fn configure_and_expand(
450450
});
451451
}
452452

453+
// Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
454+
sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
455+
for (ident, spans) in identifiers.drain() {
456+
sess.diagnostic().span_err(
457+
MultiSpan::from(spans),
458+
&format!("identifiers cannot contain emojis: `{}`", ident),
459+
);
460+
}
461+
});
462+
453463
Ok(krate)
454464
}
455465

compiler/rustc_lexer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ doctest = false
1717
# Note that this crate purposefully does not depend on other rustc crates
1818
[dependencies]
1919
unicode-xid = "0.2.0"
20+
unic-emoji-char = "0.9.0"
2021

2122
[dev-dependencies]
2223
expect-test = "1.0"

compiler/rustc_lexer/src/lib.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ pub enum TokenKind {
6464
/// "ident" or "continue"
6565
/// At this step keywords are also considered identifiers.
6666
Ident,
67+
/// Like the above, but containing invalid unicode codepoints.
68+
InvalidIdent,
6769
/// "r#ident"
6870
RawIdent,
6971
/// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
@@ -411,6 +413,11 @@ impl Cursor<'_> {
411413
let kind = Str { terminated };
412414
Literal { kind, suffix_start }
413415
}
416+
// Identifier (this should be checked after other variant that can
417+
// start as identifier).
418+
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
419+
self.fake_ident_or_unknown_prefix()
420+
}
414421
_ => Unknown,
415422
};
416423
Token::new(token_kind, self.len_consumed())
@@ -492,10 +499,28 @@ impl Cursor<'_> {
492499
// we see a prefix here, it is definitely an unknown prefix.
493500
match self.first() {
494501
'#' | '"' | '\'' => UnknownPrefix,
502+
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
503+
self.fake_ident_or_unknown_prefix()
504+
}
495505
_ => Ident,
496506
}
497507
}
498508

509+
fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
510+
// Start is already eaten, eat the rest of identifier.
511+
self.eat_while(|c| {
512+
unicode_xid::UnicodeXID::is_xid_continue(c)
513+
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
514+
|| c == '\u{200d}'
515+
});
516+
// Known prefixes must have been handled earlier. So if
517+
// we see a prefix here, it is definitely an unknown prefix.
518+
match self.first() {
519+
'#' | '"' | '\'' => UnknownPrefix,
520+
_ => InvalidIdent,
521+
}
522+
}
523+
499524
fn number(&mut self, first_digit: char) -> LiteralKind {
500525
debug_assert!('0' <= self.prev() && self.prev() <= '9');
501526
let mut base = Base::Decimal;

compiler/rustc_parse/src/lexer/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,12 @@ impl<'a> StringReader<'a> {
222222
}
223223
token::Ident(sym, is_raw_ident)
224224
}
225+
rustc_lexer::TokenKind::InvalidIdent => {
226+
let sym = nfc_normalize(self.str_from(start));
227+
let span = self.mk_sp(start, self.pos);
228+
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
229+
token::Ident(sym, false)
230+
}
225231
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
226232
let suffix_start = start + BytePos(suffix_start as u32);
227233
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);

compiler/rustc_session/src/parse.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,13 @@ pub struct ParseSess {
119119
pub config: CrateConfig,
120120
pub edition: Edition,
121121
pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
122-
/// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
122+
/// Places where raw identifiers were used. This is used to avoid complaining about idents
123+
/// clashing with keywords in new editions.
123124
pub raw_identifier_spans: Lock<Vec<Span>>,
125+
/// Places where identifiers containing invalid Unicode codepoints, but which look like they
126+
/// were meant to be identifiers, were found. Useful to avoid bad tokenization when encountering emojis. We group them to
127+
/// provide a single error per unique incorrect identifier.
128+
pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
124129
source_map: Lrc<SourceMap>,
125130
pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
126131
/// Contains the spans of block expressions that could have been incomplete based on the
@@ -160,6 +165,7 @@ impl ParseSess {
160165
edition: ExpnId::root().expn_data().edition,
161166
missing_fragment_specifiers: Default::default(),
162167
raw_identifier_spans: Lock::new(Vec::new()),
168+
bad_unicode_identifiers: Lock::new(Default::default()),
163169
source_map,
164170
buffered_lints: Lock::new(vec![]),
165171
ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
struct ABig👩‍👩‍👧‍👧Family; //~ ERROR identifiers cannot contain emojis
2+
struct 👀; //~ ERROR identifiers cannot contain emojis
3+
impl 👀 {
4+
fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emojis
5+
👀
6+
}
7+
}
8+
fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emojis
9+
👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
10+
//~^ ERROR identifiers cannot contain emojis
11+
}
12+
fn main() {
13+
let _ = i_like_to_😄_a_lot(); //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
14+
//~^ ERROR identifiers cannot contain emojis
15+
}
16+
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
2+
--> $DIR/emoji-identifiers.rs:13:13
3+
|
4+
LL | fn i_like_to_😅_a_lot() -> 👀 {
5+
| ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
6+
...
7+
LL | let _ = i_like_to_😄_a_lot();
8+
| ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
9+
10+
error: identifiers cannot contain emojis: `i_like_to_😄_a_lot`
11+
--> $DIR/emoji-identifiers.rs:13:13
12+
|
13+
LL | let _ = i_like_to_😄_a_lot();
14+
| ^^^^^^^^^^^^^^^^^^
15+
16+
error: identifiers cannot contain emojis: `full_of_✨`
17+
--> $DIR/emoji-identifiers.rs:4:8
18+
|
19+
LL | fn full_of_✨() -> 👀 {
20+
| ^^^^^^^^^^
21+
22+
error: identifiers cannot contain emojis: `full_of✨`
23+
--> $DIR/emoji-identifiers.rs:9:8
24+
|
25+
LL | 👀::full_of✨()
26+
| ^^^^^^^^^
27+
28+
error: identifiers cannot contain emojis: `👀`
29+
--> $DIR/emoji-identifiers.rs:2:8
30+
|
31+
LL | struct 👀;
32+
| ^^
33+
LL | impl 👀 {
34+
| ^^
35+
LL | fn full_of_✨() -> 👀 {
36+
| ^^
37+
LL | 👀
38+
| ^^
39+
...
40+
LL | fn i_like_to_😅_a_lot() -> 👀 {
41+
| ^^
42+
LL | 👀::full_of✨()
43+
| ^^
44+
45+
error: identifiers cannot contain emojis: `i_like_to_😅_a_lot`
46+
--> $DIR/emoji-identifiers.rs:8:4
47+
|
48+
LL | fn i_like_to_😅_a_lot() -> 👀 {
49+
| ^^^^^^^^^^^^^^^^^^
50+
51+
error: identifiers cannot contain emojis: `ABig👩‍👩‍👧‍👧Family`
52+
--> $DIR/emoji-identifiers.rs:1:8
53+
|
54+
LL | struct ABig👩‍👩‍👧‍👧Family;
55+
| ^^^^^^^^^^^^^^^^^^
56+
57+
error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
58+
--> $DIR/emoji-identifiers.rs:9:8
59+
|
60+
LL | struct 👀;
61+
| ---------- function or associated item `full_of✨` not found for this
62+
...
63+
LL | 👀::full_of✨()
64+
| ^^^^^^^^^
65+
| |
66+
| function or associated item not found in `👀`
67+
| help: there is an associated function with a similar name: `full_of_✨`
68+
69+
error: aborting due to 8 previous errors
70+
71+
Some errors have detailed explanations: E0425, E0599.
72+
For more information about an error, try `rustc --explain E0425`.

0 commit comments

Comments
 (0)