From 2749be36f5402e1b0bf24f505bcc1d647fcbe2ff Mon Sep 17 00:00:00 2001
From: mbartlett21 <29034492+mbartlett21@users.noreply.github.com>
Date: Wed, 17 Nov 2021 22:08:06 +1000
Subject: [PATCH 1/4] Add `DecodeUtf8` struct and implementation

---
 library/core/src/char/decode.rs | 293 +++++++++++++++++++++++++++++++-
 1 file changed, 292 insertions(+), 1 deletion(-)

diff --git a/library/core/src/char/decode.rs b/library/core/src/char/decode.rs
index 5dd8c5ef78941..c4b92ac93b35b 100644
--- a/library/core/src/char/decode.rs
+++ b/library/core/src/char/decode.rs
@@ -2,7 +2,298 @@
 
 use crate::fmt;
 
-use super::from_u32_unchecked;
+use super::{from_u32, from_u32_unchecked};
+
+/// An iterator that decodes UTF-8 encoded code points from an iterator of `u8`s.
+///
+/// This `struct` is created by the [`decode_utf8`] method on [`char`]. See its
+/// documentation for more.
+///
+/// [`decode_utf8`]: char::decode_utf8
+#[derive(Clone, Debug)]
+pub struct DecodeUtf8<I>
+where
+    I: Iterator<Item = u8>,
+{
+    iter: I,
+    buf: DecodeUtf8Buffer,
+}
+
+/// An error that can be returned when decoding UTF-8 code points.
+///
+/// This `struct` is created when using the [`DecodeUtf8`] type.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct DecodeUtf8Error {
+    code: u8,
+}
+
+/// Creates an iterator over the UTF-8 encoded code points in `iter`, returning
+/// invalid bytes as `Err`s.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// use std::char::decode_utf8;
+///
+/// // 🦀the<invalid>crab<invalid>
+/// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
+///
+/// assert_eq!(
+///     decode_utf8(v.iter().copied())
+///         .map(|r| r.map_err(|e| e.invalid_byte()))
+///         .collect::<Vec<_>>(),
+///     vec![
+///         Ok('🦀'),
+///         Ok('t'), Ok('h'), Ok('e'),
+///         Err(0xFF),
+///         Ok('c'), Ok('r'), Ok('a'), Ok('b'),
+///         Err(0x80),
+///     ]
+/// );
+/// ```
+///
+/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
+///
+/// ```
+/// use std::char::{decode_utf8, REPLACEMENT_CHARACTER};
+///
+/// // 🦀the<invalid>crab<invalid>
+/// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
+///
+/// assert_eq!(
+///     decode_utf8(v.iter().copied())
+///         .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
+///         .collect::<String>(),
+///     "🦀the�crab�"
+/// );
+/// ```
+#[inline]
+pub fn decode_utf8<I: IntoIterator<Item = u8>>(iter: I) -> DecodeUtf8<I::IntoIter> {
+    DecodeUtf8 {
+        iter: iter.into_iter(),
+        buf: DecodeUtf8Buffer::Empty,
+    }
+}
+
+#[derive(Clone, Debug)]
+enum DecodeUtf8Buffer {
+    Empty,
+    One(u8),
+    Two(u8, u8),
+    Three(u8, u8, u8),
+}
+
+impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
+    type Item = Result<char, DecodeUtf8Error>;
+
+    fn next(&mut self) -> Option<Result<char, DecodeUtf8Error>> {
+        use DecodeUtf8Buffer::*;
+
+        macro_rules! valid_cont {
+            ($cont:expr) => {
+                (0b1000_0000..=0b1011_1111).contains(&$cont)
+            };
+        }
+
+        macro_rules! err {
+            ($c:expr) => {
+                return Some(Err(DecodeUtf8Error { code: $c }))
+            };
+        }
+
+        #[inline(always)]
+        fn from_utf8x2(c1: u8, c2: u8) -> char {
+            let c = (c2 as u32 & 0b0011_1111) + ((c1 as u32 & 0b0001_1111) << 6);
+            // SAFETY: the masks bound `c` to at most 0x7ff, below the surrogate range
+            unsafe { from_u32_unchecked(c) }
+        }
+
+        #[inline(always)]
+        fn from_utf8x3(c1: u8, c2: u8, c3: u8) -> Option<char> {
+            let c = (c3 as u32 & 0b0011_1111)
+                + ((c2 as u32 & 0b0011_1111) << 6)
+                + ((c1 as u32 & 0b0000_1111) << 12);
+            // anything below 0x800 is an overlong encoding; `from_u32` rejects surrogates
+            if c >= 0x800 { from_u32(c) } else { None }
+        }
+
+        #[inline(always)]
+        fn from_utf8x4(c1: u8, c2: u8, c3: u8, c4: u8) -> Option<char> {
+            let c = (c4 as u32 & 0b0011_1111)
+                + ((c3 as u32 & 0b0011_1111) << 6)
+                + ((c2 as u32 & 0b0011_1111) << 12)
+                + ((c1 as u32 & 0b0000_0111) << 18);
+            // anything below 0x1_0000 is overlong; `from_u32` rejects values above 0x10_ffff
+            if c >= 0x1_0000 { from_u32(c) } else { None }
+        }
+
+        loop {
+            match self.buf {
+                Empty | One(_) => {
+                    // Empty buffer: Test the next character for utf-8-ness
+                    let c = match self.buf {
+                        Empty => self.iter.next()?,
+                        One(c) => {
+                            self.buf = Empty;
+                            c
+                        }
+                        _ => unreachable!(),
+                    };
+                    match c {
+                        // ASCII
+                        0..=0x7f => return Some(Ok(c as char)),
+                        // Start byte
+                        0b1100_0010..=0b1101_1111
+                        | 0b1110_0000..=0b1110_1111
+                        | 0b1111_0000..=0b1111_0111 => {
+                            if let Some(cont) = self.iter.next() {
+                                self.buf = Two(c, cont); // push2
+                            } else {
+                                err!(c);
+                            }
+                        }
+                        // Continuation byte or Invalid byte
+                        _ => err!(c),
+                    }
+                }
+                Two(c1, c2) => {
+                    // in: 2
+                    // out: 0j, 1j, 3
+                    match c1 {
+                        // ASCII
+                        0..=0x7f => {
+                            self.buf = One(c2); // pop
+                            return Some(Ok(c1 as char));
+                        }
+                        // Start byte for 2
+                        0b1100_0010..=0b1101_1111 => {
+                            if valid_cont!(c2) {
+                                self.buf = Empty; // pop2
+                                return Some(Ok(from_utf8x2(c1, c2)));
+                            } else {
+                                self.buf = One(c2); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Start byte for 3 or 4
+                        0b1110_0000..=0b1110_1111 | 0b1111_0000..=0b1111_0111 => {
+                            if let Some(cont) = self.iter.next() {
+                                self.buf = Three(c1, c2, cont); // push
+                            } else {
+                                self.buf = One(c2); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Continuation byte or Invalid byte
+                        _ => {
+                            self.buf = One(c2);
+                            err!(c1);
+                        }
+                    }
+                }
+                Three(c1, c2, c3) => {
+                    // in: 3
+                    // out: 0j, 1j, 2j, 3j
+                    match c1 {
+                        // ASCII
+                        0..=0x7f => {
+                            self.buf = Two(c2, c3); // pop
+                            return Some(Ok(c1 as char));
+                        }
+                        // Start byte for 2
+                        0b1100_0010..=0b1101_1111 => {
+                            if valid_cont!(c2) {
+                                self.buf = One(c3); // pop2
+                                return Some(Ok(from_utf8x2(c1, c2)));
+                            } else {
+                                self.buf = Two(c2, c3); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Start byte for 3
+                        0b1110_0000..=0b1110_1111 => {
+                            if valid_cont!(c2) && valid_cont!(c3) {
+                                match from_utf8x3(c1, c2, c3) {
+                                    Some(c) => {
+                                        self.buf = Empty; // pop3
+                                        return Some(Ok(c));
+                                    }
+                                    None => {
+                                        // It was in the invalid range
+                                        self.buf = Two(c2, c3); // pop
+                                        err!(c1);
+                                    }
+                                }
+                            } else {
+                                self.buf = Two(c2, c3); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Start byte for 4
+                        0b1111_0000..=0b1111_0111 => {
+                            if let Some(c4) = self.iter.next() {
+                                // Handle inline
+                                if valid_cont!(c2) && valid_cont!(c3) && valid_cont!(c4) {
+                                    match from_utf8x4(c1, c2, c3, c4) {
+                                        Some(c) => {
+                                            self.buf = Empty; // pop3
+                                            return Some(Ok(c));
+                                        }
+                                        None => {
+                                            // It was in the invalid range
+                                            self.buf = Three(c2, c3, c4); // push/pop
+                                            err!(c1);
+                                        }
+                                    }
+                                } else {
+                                    self.buf = Three(c2, c3, c4); // push/pop
+                                    err!(c1);
+                                }
+                            } else {
+                                self.buf = Two(c2, c3); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Continuation byte or Invalid byte
+                        _ => {
+                            self.buf = Two(c2, c3); // pop
+                            err!(c1);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.size_hint();
+        let buffered = match self.buf {
+            DecodeUtf8Buffer::Empty => 0,
+            DecodeUtf8Buffer::One(_) => 1,
+            DecodeUtf8Buffer::Two(..) => 2,
+            DecodeUtf8Buffer::Three(..) => 3,
+        };
+        // every item consumes one to four bytes, including the bytes already held in `buf`
+        (low / 4, high.and_then(|h| h.checked_add(buffered)))
+    }
+}
+
+impl DecodeUtf8Error {
+    /// Returns the invalid byte which caused this error.
+    #[must_use]
+    pub fn invalid_byte(&self) -> u8 {
+        self.code
+    }
+}
+
+impl fmt::Display for DecodeUtf8Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "invalid byte found: {:x}", self.code)
+    }
+}
 
 /// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s.
 ///

From 94c3ecf1eb0e85a25345c80bf1fe13bcf2f3c2ca Mon Sep 17 00:00:00 2001
From: mbartlett21 <29034492+mbartlett21@users.noreply.github.com>
Date: Wed, 17 Nov 2021 22:10:51 +1000
Subject: [PATCH 2/4] Add instability for `DecodeUtf8`

---
 library/core/src/char/decode.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/library/core/src/char/decode.rs b/library/core/src/char/decode.rs
index c4b92ac93b35b..4aab0a29a456b 100644
--- a/library/core/src/char/decode.rs
+++ b/library/core/src/char/decode.rs
@@ -10,6 +10,7 @@ use super::{from_u32, from_u32_unchecked};
 /// documentation for more.
 ///
 /// [`decode_utf8`]: char::decode_utf8
+#[unstable(feature = "decode_utf8", issue = "none")]
 #[derive(Clone, Debug)]
 pub struct DecodeUtf8<I>
 where
@@ -22,6 +23,7 @@ where
 /// An error that can be returned when decoding UTF-8 code points.
 ///
 /// This `struct` is created when using the [`DecodeUtf8`] type.
+#[unstable(feature = "decode_utf8", issue = "none")]
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub struct DecodeUtf8Error {
     code: u8,
@@ -70,6 +72,7 @@ pub struct DecodeUtf8Error {
 /// );
 /// ```
 #[inline]
+#[unstable(feature = "decode_utf8", issue = "none")]
 pub fn decode_utf8<I: IntoIterator<Item = u8>>(iter: I) -> DecodeUtf8<I::IntoIter> {
     DecodeUtf8 {
         iter: iter.into_iter(),
@@ -85,6 +88,7 @@ enum DecodeUtf8Buffer {
     Three(u8, u8, u8),
 }
 
+#[unstable(feature = "decode_utf8", issue = "none")]
 impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
     type Item = Result<char, DecodeUtf8Error>;
 
@@ -284,11 +288,13 @@ impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
 impl DecodeUtf8Error {
     /// Returns the invalid byte which caused this error.
     #[must_use]
+    #[unstable(feature = "decode_utf8", issue = "none")]
     pub fn invalid_byte(&self) -> u8 {
         self.code
     }
 }
 
+#[unstable(feature = "decode_utf8", issue = "none")]
 impl fmt::Display for DecodeUtf8Error {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "invalid byte found: {:x}", self.code)

From 5ad8e4d6ad41ed4ed7641d3a3d4d011ecc29aee7 Mon Sep 17 00:00:00 2001
From: mbartlett21 <29034492+mbartlett21@users.noreply.github.com>
Date: Wed, 17 Nov 2021 22:12:17 +1000
Subject: [PATCH 3/4] Export `decode_utf8` from `core::char`

---
 library/core/src/char/mod.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs
index 0728523d0a413..145ba16da2948 100644
--- a/library/core/src/char/mod.rs
+++ b/library/core/src/char/mod.rs
@@ -36,6 +36,10 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
 #[stable(feature = "unicode_version", since = "1.45.0")]
 pub use crate::unicode::UNICODE_VERSION;
 
+// unstable re-exports
+#[unstable(feature = "decode_utf8", issue = "none")]
+pub use self::decode::{decode_utf8, DecodeUtf8, DecodeUtf8Error};
+
 // perma-unstable re-exports
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 pub use self::methods::encode_utf16_raw;

From d6b1c503fb3ce0428bd4521a6a3d759de78564ec Mon Sep 17 00:00:00 2001
From: mbartlett21 <29034492+mbartlett21@users.noreply.github.com>
Date: Wed, 17 Nov 2021 22:13:57 +1000
Subject: [PATCH 4/4] Add `decode_utf8` method to `char`

---
 library/core/src/char/methods.rs | 48 ++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs
index 3c4972bd3c9a4..b97fe494ac8cd 100644
--- a/library/core/src/char/methods.rs
+++ b/library/core/src/char/methods.rs
@@ -41,6 +41,54 @@ impl char {
     #[stable(feature = "assoc_char_consts", since = "1.52.0")]
     pub const UNICODE_VERSION: (u8, u8, u8) = crate::unicode::UNICODE_VERSION;
 
+    /// Creates an iterator over the UTF-8 encoded code points in `iter`, returning
+    /// invalid bytes as `Err`s.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use std::char::decode_utf8;
+    ///
+    /// // 🦀the<invalid>crab<invalid>
+    /// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
+    ///
+    /// assert_eq!(
+    ///     decode_utf8(v.iter().copied())
+    ///         .map(|r| r.map_err(|e| e.invalid_byte()))
+    ///         .collect::<Vec<_>>(),
+    ///     vec![
+    ///         Ok('🦀'),
+    ///         Ok('t'), Ok('h'), Ok('e'),
+    ///         Err(0xFF),
+    ///         Ok('c'), Ok('r'), Ok('a'), Ok('b'),
+    ///         Err(0x80),
+    ///     ]
+    /// );
+    /// ```
+    ///
+    /// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
+    ///
+    /// ```
+    /// use std::char::{decode_utf8, REPLACEMENT_CHARACTER};
+    ///
+    /// // 🦀the<invalid>crab<invalid>
+    /// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
+    ///
+    /// assert_eq!(
+    ///     decode_utf8(v.iter().copied())
+    ///         .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
+    ///         .collect::<String>(),
+    ///     "🦀the�crab�"
+    /// );
+    /// ```
+    #[unstable(feature = "decode_utf8", issue = "none")]
+    #[inline]
+    pub fn decode_utf8<I: IntoIterator<Item = u8>>(iter: I) -> DecodeUtf8<I::IntoIter> {
+        super::decode::decode_utf8(iter)
+    }
+
     /// Creates an iterator over the UTF-16 encoded code points in `iter`,
     /// returning unpaired surrogates as `Err`s.
     ///