Skip to content

Optimize EscapeIterInner #125317

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 77 additions & 91 deletions library/core/src/escape.rs
Original file line number Diff line number Diff line change
@@ -1,125 +1,107 @@
//! Helper code for character escaping.

use crate::ascii;
use crate::mem::MaybeUninit;
use crate::num::NonZero;
use crate::ops::Range;

const HEX_DIGITS: [ascii::Char; 16] = *b"0123456789abcdef".as_ascii().unwrap();

#[inline]
const fn backslash<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u8>) {
const { assert!(N >= 2) };

let mut output = [ascii::Char::Null; N];

output[0] = ascii::Char::ReverseSolidus;
output[1] = a;

(output, 0..2)
}

/// Escapes an ASCII character.
///
/// Returns a buffer and the length of the escaped representation.
const fn escape_ascii<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) {
const { assert!(N >= 4) };

match byte {
b'\t' => backslash(ascii::Char::SmallT),
b'\r' => backslash(ascii::Char::SmallR),
b'\n' => backslash(ascii::Char::SmallN),
b'\\' => backslash(ascii::Char::ReverseSolidus),
b'\'' => backslash(ascii::Char::Apostrophe),
b'\"' => backslash(ascii::Char::QuotationMark),
byte => {
let mut output = [ascii::Char::Null; N];

if let Some(c) = byte.as_ascii()
&& !byte.is_ascii_control()
{
output[0] = c;
(output, 0..1)
} else {
let hi = HEX_DIGITS[(byte >> 4) as usize];
let lo = HEX_DIGITS[(byte & 0xf) as usize];

output[0] = ascii::Char::ReverseSolidus;
output[1] = ascii::Char::SmallX;
output[2] = hi;
output[3] = lo;

(output, 0..4)
}
}
}
}

/// Escapes a character `\u{NNNN}` representation.
///
/// Returns a buffer and the length of the escaped representation.
const fn escape_unicode<const N: usize>(c: char) -> ([ascii::Char; N], Range<u8>) {
const { assert!(N >= 10 && N < u8::MAX as usize) };

let c = u32::from(c);

// OR-ing `1` ensures that for `c == 0` the code computes that
// one digit should be printed.
let start = (c | 1).leading_zeros() as usize / 4 - 2;

let mut output = [ascii::Char::Null; N];
output[3] = HEX_DIGITS[((c >> 20) & 15) as usize];
output[4] = HEX_DIGITS[((c >> 16) & 15) as usize];
output[5] = HEX_DIGITS[((c >> 12) & 15) as usize];
output[6] = HEX_DIGITS[((c >> 8) & 15) as usize];
output[7] = HEX_DIGITS[((c >> 4) & 15) as usize];
output[8] = HEX_DIGITS[((c >> 0) & 15) as usize];
output[9] = ascii::Char::RightCurlyBracket;
output[start + 0] = ascii::Char::ReverseSolidus;
output[start + 1] = ascii::Char::SmallU;
output[start + 2] = ascii::Char::LeftCurlyBracket;

(output, (start as u8)..(N as u8))
}

/// An iterator over an fixed-size array.
///
/// This is essentially equivalent to array’s IntoIter except that indexes are
/// limited to u8 to reduce size of the structure.
#[derive(Clone, Debug)]
pub(crate) struct EscapeIterInner<const N: usize> {
// The element type ensures this is always ASCII, and thus also valid UTF-8.
data: [ascii::Char; N],
// Invariant: all elements inside the range indexed by `alive` are initialized
data: [MaybeUninit<ascii::Char>; N],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that, for small N, it's not obvious at all to me that this is necessarily a win. Zero-initializing a [ascii::Char; 8] for example is just a single 0_u64, and having it be MaybeUninit instead comes with the cost of it not getting noundef when it's passed around, reducing optimization possibilities for what LLVM can do with it.

For large, especially heap-allocated buffers it can help to avoid initializing them, but making all this code safety-critical (EscapeIterInner::ascii now has to be considered for understanding if it's UB, for example) is not at all obvious that it's a good tradeoff.

(We really need unsafe structs so that potential UB stops hiding in struct literals.)

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The relevant N is 10, for \u{NNNNNN}.


// Invariant: `alive.start <= alive.end <= N`
alive: Range<u8>,
}

impl<const N: usize> EscapeIterInner<N> {
pub const fn backslash(c: ascii::Char) -> Self {
let (data, range) = backslash(c);
Self { data, alive: range }
const { assert!(N >= 2) };

let mut data = [MaybeUninit::uninit(); N];

data[0] = MaybeUninit::new(ascii::Char::ReverseSolidus);
data[1] = MaybeUninit::new(c);

Self { data, alive: 0..2 }
}

/// Escapes an ASCII character.
pub const fn ascii(c: u8) -> Self {
let (data, range) = escape_ascii(c);
Self { data, alive: range }
const { assert!(N >= 4) };

match c {
b'\t' => Self::backslash(ascii::Char::SmallT),
b'\r' => Self::backslash(ascii::Char::SmallR),
b'\n' => Self::backslash(ascii::Char::SmallN),
b'\\' => Self::backslash(ascii::Char::ReverseSolidus),
b'\'' => Self::backslash(ascii::Char::Apostrophe),
b'\"' => Self::backslash(ascii::Char::QuotationMark),
byte => {
let mut data = [MaybeUninit::uninit(); N];

if let Some(c) = byte.as_ascii()
&& !byte.is_ascii_control()
{
data[0] = MaybeUninit::new(c);
Self { data, alive: 0..1 }
} else {
let hi = HEX_DIGITS[(byte >> 4) as usize];
let lo = HEX_DIGITS[(byte & 0xf) as usize];

data[0] = MaybeUninit::new(ascii::Char::ReverseSolidus);
data[1] = MaybeUninit::new(ascii::Char::SmallX);
data[2] = MaybeUninit::new(hi);
data[3] = MaybeUninit::new(lo);

Self { data, alive: 0..4 }
}
}
}
}

/// Escapes a character `\u{NNNN}` representation.
pub const fn unicode(c: char) -> Self {
let (data, range) = escape_unicode(c);
Self { data, alive: range }
const { assert!(N >= 10 && N < u8::MAX as usize) };

let c = c as u32;

// OR-ing `1` ensures that for `c == 0` the code computes that
// one digit should be printed.
let start = (c | 1).leading_zeros() as usize / 4 - 2;

let mut data = [MaybeUninit::uninit(); N];
data[3] = MaybeUninit::new(HEX_DIGITS[((c >> 20) & 15) as usize]);
data[4] = MaybeUninit::new(HEX_DIGITS[((c >> 16) & 15) as usize]);
data[5] = MaybeUninit::new(HEX_DIGITS[((c >> 12) & 15) as usize]);
data[6] = MaybeUninit::new(HEX_DIGITS[((c >> 8) & 15) as usize]);
data[7] = MaybeUninit::new(HEX_DIGITS[((c >> 4) & 15) as usize]);
data[8] = MaybeUninit::new(HEX_DIGITS[((c >> 0) & 15) as usize]);
data[9] = MaybeUninit::new(ascii::Char::RightCurlyBracket);
data[start + 0] = MaybeUninit::new(ascii::Char::ReverseSolidus);
data[start + 1] = MaybeUninit::new(ascii::Char::SmallU);
data[start + 2] = MaybeUninit::new(ascii::Char::LeftCurlyBracket);

Self { data, alive: start as u8..10 }
}

#[inline]
pub const fn empty() -> Self {
Self { data: [ascii::Char::Null; N], alive: 0..0 }
Self { data: [MaybeUninit::uninit(); N], alive: 0..0 }
}

#[inline]
pub fn as_ascii(&self) -> &[ascii::Char] {
// SAFETY: `self.alive` is guaranteed to be a valid range for indexing `self.data`.
// SAFETY: the range indexed by `self.alive` is guaranteed to contain valid data.
unsafe {
self.data.get_unchecked(usize::from(self.alive.start)..usize::from(self.alive.end))
let data = self.data.get_unchecked(self.alive.start as usize..self.alive.end as usize);
Comment on lines -122 to +103
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you change usize::fromto as here? I prefer the from conversions

MaybeUninit::slice_assume_init_ref(data)
}
}

Expand All @@ -130,27 +112,31 @@ impl<const N: usize> EscapeIterInner<N> {

#[inline]
pub fn len(&self) -> usize {
usize::from(self.alive.end - self.alive.start)
self.alive.len()
}

#[inline]
pub fn next(&mut self) -> Option<u8> {
let i = self.alive.next()?;

// SAFETY: `i` is guaranteed to be a valid index for `self.data`.
unsafe { Some(self.data.get_unchecked(usize::from(i)).to_u8()) }
// SAFETY: the range indexed by `self.alive` is guaranteed to contain initialized data.
unsafe { Some(MaybeUninit::assume_init_ref(self.data.get_unchecked(i as usize)).to_u8()) }
}

#[inline]
pub fn next_back(&mut self) -> Option<u8> {
let i = self.alive.next_back()?;

// SAFETY: `i` is guaranteed to be a valid index for `self.data`.
unsafe { Some(self.data.get_unchecked(usize::from(i)).to_u8()) }
// SAFETY: the range indexed by `self.alive` is guaranteed to contain initialized data.
unsafe { Some(MaybeUninit::assume_init_ref(self.data.get_unchecked(i as usize)).to_u8()) }
}

#[inline]
pub fn advance_by(&mut self, n: usize) -> Result<(), NonZero<usize>> {
self.alive.advance_by(n)
}

#[inline]
pub fn advance_back_by(&mut self, n: usize) -> Result<(), NonZero<usize>> {
self.alive.advance_back_by(n)
}
Expand Down
Loading