Skip to content

Commit cf682e0

Browse files
committed
Use NonZeroU32 instead of u32 within Symbol.
This shrinks `Option<Symbol>` from 8 bytes to 4 bytes, which shrinks `Token` from 24 bytes to 16 bytes. This reduces instruction counts by up to 1% across a range of benchmarks. The commit introduces a new type, `SymbolVec`, to encapsulate the 1-indexing now required to inter-operate with the non-zero indices.
1 parent b755501 commit cf682e0

File tree

3 files changed

+131
-89
lines changed

3 files changed

+131
-89
lines changed

src/libsyntax/parse/token.rs

+4
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ pub enum Token {
207207
Eof,
208208
}
209209

210+
// `Token` is used a lot. Make sure it doesn't unintentionally get bigger.
211+
#[cfg(target_arch = "x86_64")]
212+
static_assert!(MEM_SIZE_OF_STATEMENT: mem::size_of::<Token>() == 16);
213+
210214
impl Token {
211215
pub fn interpolated(nt: Nonterminal) -> Token {
212216
Token::Interpolated(Lrc::new((nt, LazyTokenStream::new())))

src/libsyntax_pos/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#![cfg_attr(not(stage0), feature(stdsimd))]
2929

3030
extern crate arena;
31+
#[macro_use]
3132
extern crate rustc_data_structures;
3233

3334
#[macro_use]

src/libsyntax_pos/symbol.rs

+126-89
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use std::fmt;
2020
use std::str;
2121
use std::cmp::{PartialEq, Ordering, PartialOrd, Ord};
2222
use std::hash::{Hash, Hasher};
23+
use std::num::NonZeroU32;
2324

2425
use hygiene::SyntaxContext;
2526
use {Span, DUMMY_SP, GLOBALS};
@@ -143,9 +144,10 @@ impl Decodable for Ident {
143144
}
144145
}
145146

146-
/// A symbol is an interned or gensymed string.
147+
/// A symbol is an interned or gensymed string. It's a `NonZeroU32` so that
148+
/// `Option<Symbol>` only takes up 4 bytes.
147149
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
148-
pub struct Symbol(u32);
150+
pub struct Symbol(NonZeroU32);
149151

150152
// The interner is pointed to by a thread local value which is only set on the main thread
151153
// when parallelization is disabled. So we don't allow `Symbol` to transfer between threads
@@ -188,8 +190,9 @@ impl Symbol {
188190
})
189191
}
190192

193+
#[inline(always)]
191194
pub fn as_u32(self) -> u32 {
192-
self.0
195+
self.0.get()
193196
}
194197
}
195198

@@ -228,12 +231,36 @@ impl<T: ::std::ops::Deref<Target=str>> PartialEq<T> for Symbol {
228231
}
229232
}
230233

234+
/// Symbols (which are 1-indexed) index into this (which is 0-indexed
235+
/// internally). The methods handle the index conversions.
236+
#[derive(Default)]
237+
pub struct SymbolVec(Vec<&'static str>);
238+
239+
impl SymbolVec {
240+
#[inline]
241+
fn new_symbol(&mut self, s: &'static str) -> Symbol {
242+
self.0.push(s);
243+
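// SAFETY: `self.0.len()` is at least 1 because of the push above. NOTE(review): this assumes the length fits in a `u32`; the `as u32` cast would truncate otherwise.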
244+
Symbol(unsafe { NonZeroU32::new_unchecked(self.0.len() as u32) })
245+
}
246+
247+
#[inline]
248+
fn get(&self, sym: Symbol) -> Option<&&'static str> {
249+
self.0.get(sym.0.get() as usize - 1)
250+
}
251+
252+
#[inline]
253+
fn contains(&self, sym: Symbol) -> bool {
254+
sym.0.get() as usize <= self.0.len()
255+
}
256+
}
257+
231258
// The `&'static str`s in this type actually point into the arena.
232259
#[derive(Default)]
233260
pub struct Interner {
234261
arena: DroplessArena,
235262
names: FxHashMap<&'static str, Symbol>,
236-
strings: Vec<&'static str>,
263+
strings: SymbolVec,
237264
gensyms: Vec<Symbol>,
238265
}
239266

@@ -243,9 +270,8 @@ impl Interner {
243270
for &string in init {
244271
if string == "" {
245272
// We can't allocate empty strings in the arena, so handle this here.
246-
let name = Symbol(this.strings.len() as u32);
273+
let name = this.strings.new_symbol("");
247274
this.names.insert("", name);
248-
this.strings.push("");
249275
} else {
250276
this.intern(string);
251277
}
@@ -258,8 +284,6 @@ impl Interner {
258284
return name;
259285
}
260286

261-
let name = Symbol(self.strings.len() as u32);
262-
263287
// `from_utf8_unchecked` is safe since we just allocated a `&str` which is known to be
264288
// UTF-8.
265289
let string: &str = unsafe {
@@ -270,16 +294,17 @@ impl Interner {
270294
let string: &'static str = unsafe {
271295
&*(string as *const str)
272296
};
273-
self.strings.push(string);
297+
298+
let name = self.strings.new_symbol(string);
274299
self.names.insert(string, name);
275300
name
276301
}
277302

278303
pub fn interned(&self, symbol: Symbol) -> Symbol {
279-
if (symbol.0 as usize) < self.strings.len() {
304+
if self.strings.contains(symbol) {
280305
symbol
281306
} else {
282-
self.interned(self.gensyms[(!0 - symbol.0) as usize])
307+
self.interned(self.gensyms[(!0 - symbol.as_u32()) as usize])
283308
}
284309
}
285310

@@ -290,17 +315,17 @@ impl Interner {
290315

291316
fn gensymed(&mut self, symbol: Symbol) -> Symbol {
292317
self.gensyms.push(symbol);
293-
Symbol(!0 - self.gensyms.len() as u32 + 1)
318+
Symbol(NonZeroU32::new(!0 - self.gensyms.len() as u32 + 1).unwrap())
294319
}
295320

296321
fn is_gensymed(&mut self, symbol: Symbol) -> bool {
297-
symbol.0 as usize >= self.strings.len()
322+
!self.strings.contains(symbol)
298323
}
299324

300325
pub fn get(&self, symbol: Symbol) -> &str {
301-
match self.strings.get(symbol.0 as usize) {
326+
match self.strings.get(symbol) {
302327
Some(string) => string,
303-
None => self.get(self.gensyms[(!0 - symbol.0) as usize]),
328+
None => self.get(self.gensyms[(!0 - symbol.as_u32()) as usize]),
304329
}
305330
}
306331
}
@@ -313,6 +338,8 @@ macro_rules! declare_keywords {(
313338
) => {
314339
pub mod keywords {
315340
use super::{Symbol, Ident};
341+
use std::num::NonZeroU32;
342+
316343
#[derive(Clone, Copy, PartialEq, Eq)]
317344
pub struct Keyword {
318345
ident: Ident,
@@ -321,10 +348,17 @@ macro_rules! declare_keywords {(
321348
#[inline] pub fn ident(self) -> Ident { self.ident }
322349
#[inline] pub fn name(self) -> Symbol { self.ident.name }
323350
}
351+
// We must use `NonZeroU32::new_unchecked` below because it's `const`
352+
// and `NonZeroU32::new` is not. So we static_assert the non-zeroness
353+
// here.
354+
mod asserts {
355+
$(static_assert!($konst: $index > 0u32);)*
356+
}
324357
$(
325358
#[allow(non_upper_case_globals)]
326359
pub const $konst: Keyword = Keyword {
327-
ident: Ident::with_empty_ctxt(super::Symbol($index))
360+
ident: Ident::with_empty_ctxt(
361+
super::Symbol(unsafe { NonZeroU32::new_unchecked($index) }))
328362
};
329363
)*
330364

@@ -355,79 +389,80 @@ macro_rules! declare_keywords {(
355389
declare_keywords! {
356390
// Special reserved identifiers used internally for elided lifetimes,
357391
// unnamed method parameters, crate root module, error recovery etc.
358-
(0, Invalid, "")
359-
(1, PathRoot, "{{root}}")
360-
(2, DollarCrate, "$crate")
361-
(3, Underscore, "_")
392+
// (0 cannot be used because Symbol uses NonZeroU32)
393+
(1, Invalid, "")
394+
(2, PathRoot, "{{root}}")
395+
(3, DollarCrate, "$crate")
396+
(4, Underscore, "_")
362397

363398
// Keywords that are used in stable Rust.
364-
(4, As, "as")
365-
(5, Box, "box")
366-
(6, Break, "break")
367-
(7, Const, "const")
368-
(8, Continue, "continue")
369-
(9, Crate, "crate")
370-
(10, Else, "else")
371-
(11, Enum, "enum")
372-
(12, Extern, "extern")
373-
(13, False, "false")
374-
(14, Fn, "fn")
375-
(15, For, "for")
376-
(16, If, "if")
377-
(17, Impl, "impl")
378-
(18, In, "in")
379-
(19, Let, "let")
380-
(20, Loop, "loop")
381-
(21, Match, "match")
382-
(22, Mod, "mod")
383-
(23, Move, "move")
384-
(24, Mut, "mut")
385-
(25, Pub, "pub")
386-
(26, Ref, "ref")
387-
(27, Return, "return")
388-
(28, SelfLower, "self")
389-
(29, SelfUpper, "Self")
390-
(30, Static, "static")
391-
(31, Struct, "struct")
392-
(32, Super, "super")
393-
(33, Trait, "trait")
394-
(34, True, "true")
395-
(35, Type, "type")
396-
(36, Unsafe, "unsafe")
397-
(37, Use, "use")
398-
(38, Where, "where")
399-
(39, While, "while")
399+
(5, As, "as")
400+
(6, Box, "box")
401+
(7, Break, "break")
402+
(8, Const, "const")
403+
(9, Continue, "continue")
404+
(10, Crate, "crate")
405+
(11, Else, "else")
406+
(12, Enum, "enum")
407+
(13, Extern, "extern")
408+
(14, False, "false")
409+
(15, Fn, "fn")
410+
(16, For, "for")
411+
(17, If, "if")
412+
(18, Impl, "impl")
413+
(19, In, "in")
414+
(20, Let, "let")
415+
(21, Loop, "loop")
416+
(22, Match, "match")
417+
(23, Mod, "mod")
418+
(24, Move, "move")
419+
(25, Mut, "mut")
420+
(26, Pub, "pub")
421+
(27, Ref, "ref")
422+
(28, Return, "return")
423+
(29, SelfLower, "self")
424+
(30, SelfUpper, "Self")
425+
(31, Static, "static")
426+
(32, Struct, "struct")
427+
(33, Super, "super")
428+
(34, Trait, "trait")
429+
(35, True, "true")
430+
(36, Type, "type")
431+
(37, Unsafe, "unsafe")
432+
(38, Use, "use")
433+
(39, Where, "where")
434+
(40, While, "while")
400435

401436
// Keywords that are used in unstable Rust or reserved for future use.
402-
(40, Abstract, "abstract")
403-
(41, Become, "become")
404-
(42, Do, "do")
405-
(43, Final, "final")
406-
(44, Macro, "macro")
407-
(45, Override, "override")
408-
(46, Priv, "priv")
409-
(47, Typeof, "typeof")
410-
(48, Unsized, "unsized")
411-
(49, Virtual, "virtual")
412-
(50, Yield, "yield")
437+
(41, Abstract, "abstract")
438+
(42, Become, "become")
439+
(43, Do, "do")
440+
(44, Final, "final")
441+
(45, Macro, "macro")
442+
(46, Override, "override")
443+
(47, Priv, "priv")
444+
(48, Typeof, "typeof")
445+
(49, Unsized, "unsized")
446+
(50, Virtual, "virtual")
447+
(51, Yield, "yield")
413448

414449
// Edition-specific keywords that are used in stable Rust.
415-
(51, Dyn, "dyn") // >= 2018 Edition only
450+
(52, Dyn, "dyn") // >= 2018 Edition only
416451

417452
// Edition-specific keywords that are used in unstable Rust or reserved for future use.
418-
(52, Async, "async") // >= 2018 Edition only
419-
(53, Try, "try") // >= 2018 Edition only
453+
(53, Async, "async") // >= 2018 Edition only
454+
(54, Try, "try") // >= 2018 Edition only
420455

421456
// Special lifetime names
422-
(54, UnderscoreLifetime, "'_")
423-
(55, StaticLifetime, "'static")
457+
(55, UnderscoreLifetime, "'_")
458+
(56, StaticLifetime, "'static")
424459

425460
// Weak keywords, have special meaning only in specific contexts.
426-
(56, Auto, "auto")
427-
(57, Catch, "catch")
428-
(58, Default, "default")
429-
(59, Existential, "existential")
430-
(60, Union, "union")
461+
(57, Auto, "auto")
462+
(58, Catch, "catch")
463+
(59, Default, "default")
464+
(60, Existential, "existential")
465+
(61, Union, "union")
431466
}
432467

433468
impl Symbol {
@@ -708,20 +743,22 @@ mod tests {
708743
#[test]
709744
fn interner_tests() {
710745
let mut i: Interner = Interner::default();
711-
// first one is zero:
712-
assert_eq!(i.intern("dog"), Symbol(0));
746+
let nz = |n| NonZeroU32::new(n).unwrap();
747+
748+
// first one is 1:
749+
assert_eq!(i.intern("dog"), Symbol(nz(1)));
713750
// re-use gets the same entry:
714-
assert_eq!(i.intern("dog"), Symbol(0));
715-
// different string gets a different #:
716-
assert_eq!(i.intern("cat"), Symbol(1));
717-
assert_eq!(i.intern("cat"), Symbol(1));
718-
// dog is still at zero
719-
assert_eq!(i.intern("dog"), Symbol(0));
720-
assert_eq!(i.gensym("zebra"), Symbol(4294967295));
721-
// gensym of same string gets new number :
722-
assert_eq!(i.gensym("zebra"), Symbol(4294967294));
751+
assert_eq!(i.intern("dog"), Symbol(nz(1)));
752+
// different string gets a different number:
753+
assert_eq!(i.intern("cat"), Symbol(nz(2)));
754+
assert_eq!(i.intern("cat"), Symbol(nz(2)));
755+
// dog is still at 1
756+
assert_eq!(i.intern("dog"), Symbol(nz(1)));
757+
assert_eq!(i.gensym("zebra"), Symbol(nz(4294967295)));
758+
// gensym of same string gets new number:
759+
assert_eq!(i.gensym("zebra"), Symbol(nz(4294967294)));
723760
// gensym of *existing* string gets new number:
724-
assert_eq!(i.gensym("dog"), Symbol(4294967293));
761+
assert_eq!(i.gensym("dog"), Symbol(nz(4294967293)));
725762
}
726763

727764
#[test]

0 commit comments

Comments
 (0)