From 2749be36f5402e1b0bf24f505bcc1d647fcbe2ff Mon Sep 17 00:00:00 2001
From: mbartlett21 <29034492+mbartlett21@users.noreply.github.com>
Date: Wed, 17 Nov 2021 22:08:06 +1000
Subject: [PATCH 1/4] Add `DecodeUtf8` struct and implementation

---
 library/core/src/char/decode.rs | 287 +++++++++++++++++++++++++++++++-
 1 file changed, 286 insertions(+), 1 deletion(-)
diff --git a/library/core/src/char/decode.rs b/library/core/src/char/decode.rs
index 5dd8c5ef78941..c4b92ac93b35b 100644
--- a/library/core/src/char/decode.rs
+++ b/library/core/src/char/decode.rs
@@ -2,7 +2,292 @@
 
 use crate::fmt;
 
-use super::from_u32_unchecked;
+use super::{from_u32, from_u32_unchecked};
+
+/// An iterator that decodes UTF-8 encoded code points from an iterator of `u8`s.
+///
+/// This `struct` is created by the [`decode_utf8`] method on [`char`]. See its
+/// documentation for more.
+///
+/// [`decode_utf8`]: char::decode_utf8
+#[derive(Clone, Debug)]
+pub struct DecodeUtf8<I>
+where
+    I: Iterator<Item = u8>,
+{
+    iter: I,
+    buf: DecodeUtf8Buffer,
+}
+
+/// An error that can be returned when decoding UTF-8 code points.
+///
+/// This `struct` is created when using the [`DecodeUtf8`] type.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct DecodeUtf8Error {
+    code: u8,
+}
+
+/// Creates an iterator over the UTF-8 encoded code points in `iter`, returning
+/// invalid bytes as `Err`s.
+///
+/// # Examples
+///
+/// Basic usage:
+///
+/// ```
+/// #![feature(decode_utf8)]
+/// use std::char::decode_utf8;
+///
+/// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80"; // 🦀the<invalid>crab<invalid>
+///
+/// assert_eq!(
+///     decode_utf8(v.iter().copied())
+///         .map(|r| r.map_err(|e| e.invalid_byte()))
+///         .collect::<Vec<_>>(),
+///     vec![
+///         Ok('🦀'),
+///         Ok('t'), Ok('h'), Ok('e'),
+///         Err(0xFF),
+///         Ok('c'), Ok('r'), Ok('a'), Ok('b'),
+///         Err(0x80),
+///     ]
+/// );
+/// ```
+///
+/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
+///
+/// ```
+/// #![feature(decode_utf8)]
+/// use std::char::{decode_utf8, REPLACEMENT_CHARACTER};
+///
+/// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80"; // 🦀the<invalid>crab<invalid>
+///
+/// assert_eq!(
+///     decode_utf8(v.iter().copied())
+///        .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
+///        .collect::<String>(),
+///     "🦀the�crab�"
+/// );
+/// ```
+#[inline]
+pub fn decode_utf8<I: IntoIterator<Item = u8>>(iter: I) -> DecodeUtf8<I::IntoIter> {
+    DecodeUtf8 {
+        iter: iter.into_iter(),
+        buf: DecodeUtf8Buffer::Empty,
+    }
+}
+
+#[derive(Clone, Debug)]
+enum DecodeUtf8Buffer {
+    Empty,
+    One(u8),
+    Two(u8, u8),
+    Three(u8, u8, u8),
+}
+
+impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
+    type Item = Result<char, DecodeUtf8Error>;
+
+    fn next(&mut self) -> Option<Result<char, DecodeUtf8Error>> {
+        use DecodeUtf8Buffer::*;
+
+        macro_rules! valid_cont {
+            ($cont:expr) => {
+                (0b1000_0000..=0b1011_1111).contains(&$cont)
+            };
+        }
+
+        macro_rules! err {
+            ($c:expr) => {
+                return Some(Err(DecodeUtf8Error { code: $c }))
+            };
+        }
+
+        #[inline(always)]
+        fn from_utf8x2(c1: u8, c2: u8) -> char {
+            let c = (c2 as u32 & 0b0011_1111) + ((c1 as u32 & 0b0001_1111) << 6);
+            // SAFETY: the masks cap `c` at 0x7ff, which is below the surrogate range (0xd800)
+            unsafe { from_u32_unchecked(c) }
+        }
+
+        #[inline(always)]
+        fn from_utf8x3(c1: u8, c2: u8, c3: u8) -> Option<char> {
+            let c = (c3 as u32 & 0b0011_1111)
+                + ((c2 as u32 & 0b0011_1111) << 6)
+                + ((c1 as u32 & 0b0000_1111) << 12);
+            // Values below U+0800 are overlong encodings; `from_u32` rejects surrogates.
+            if c >= 0x800 { from_u32(c) } else { None }
+        }
+
+        #[inline(always)]
+        fn from_utf8x4(c1: u8, c2: u8, c3: u8, c4: u8) -> Option<char> {
+            let c = (c4 as u32 & 0b0011_1111)
+                + ((c3 as u32 & 0b0011_1111) << 6)
+                + ((c2 as u32 & 0b0011_1111) << 12)
+                + ((c1 as u32 & 0b0000_0111) << 18);
+            // Values below U+10000 are overlong; `from_u32` rejects values above U+10FFFF.
+            if c >= 0x1_0000 { from_u32(c) } else { None }
+        }
+
+        loop {
+            match self.buf {
+                Empty | One(_) => {
+                    // Take the pending byte if any, otherwise read the next byte from `iter`
+                    let c = match self.buf {
+                        Empty => self.iter.next()?,
+                        One(c) => {
+                            self.buf = Empty;
+                            c
+                        }
+                        _ => unreachable!(),
+                    };
+                    match c {
+                        // ASCII
+                        0..=0x7f => return Some(Ok(c as char)),
+                        // Start byte
+                        0b1100_0010..=0b1101_1111
+                        | 0b1110_0000..=0b1110_1111
+                        | 0b1111_0000..=0b1111_0111 => {
+                            if let Some(cont) = self.iter.next() {
+                                self.buf = Two(c, cont); // push2
+                            } else {
+                                err!(c);
+                            }
+                        }
+                        // Continuation byte or Invalid byte
+                        _ => err!(c),
+                    }
+                }
+                Two(c1, c2) => {
+                    // Two bytes buffered: emit a 1- or 2-byte result, or read a third byte.
+                    // On error only `c1` is reported and dropped; later bytes are retried.
+                    match c1 {
+                        // ASCII
+                        0..=0x7f => {
+                            self.buf = One(c2); // pop
+                            return Some(Ok(c1 as char));
+                        }
+                        // Start byte for 2
+                        0b1100_0010..=0b1101_1111 => {
+                            if valid_cont!(c2) {
+                                self.buf = Empty; // pop2
+                                return Some(Ok(from_utf8x2(c1, c2)));
+                            } else {
+                                self.buf = One(c2); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Start byte for 3 or 4
+                        0b1110_0000..=0b1110_1111 | 0b1111_0000..=0b1111_0111 => {
+                            if let Some(cont) = self.iter.next() {
+                                self.buf = Three(c1, c2, cont); // push
+                            } else {
+                                self.buf = One(c2); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Continuation byte or Invalid byte
+                        _ => {
+                            self.buf = One(c2);
+                            err!(c1);
+                        }
+                    }
+                }
+                Three(c1, c2, c3) => {
+                    // Three bytes buffered: a 4-byte sequence reads its last byte inline.
+                    // On error only `c1` is reported and dropped; later bytes are retried.
+                    match c1 {
+                        // ASCII
+                        0..=0x7f => {
+                            self.buf = Two(c2, c3); // pop
+                            return Some(Ok(c1 as char));
+                        }
+                        // Start byte for 2
+                        0b1100_0010..=0b1101_1111 => {
+                            if valid_cont!(c2) {
+                                self.buf = One(c3); // pop2
+                                return Some(Ok(from_utf8x2(c1, c2)));
+                            } else {
+                                self.buf = Two(c2, c3); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Start byte for 3
+                        0b1110_0000..=0b1110_1111 => {
+                            if valid_cont!(c2) && valid_cont!(c3) {
+                                match from_utf8x3(c1, c2, c3) {
+                                    Some(c) => {
+                                        self.buf = Empty; // pop3
+                                        return Some(Ok(c));
+                                    }
+                                    None => {
+                                        // Overlong encoding or surrogate
+                                        self.buf = Two(c2, c3); // pop
+                                        err!(c1);
+                                    }
+                                }
+                            } else {
+                                self.buf = Two(c2, c3); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Start byte for 4
+                        0b1111_0000..=0b1111_0111 => {
+                            if let Some(c4) = self.iter.next() {
+                                // Handle inline; every trailing byte must be a continuation
+                                if valid_cont!(c2) && valid_cont!(c3) && valid_cont!(c4) {
+                                    match from_utf8x4(c1, c2, c3, c4) {
+                                        Some(c) => {
+                                            self.buf = Empty; // all four bytes consumed
+                                            return Some(Ok(c));
+                                        }
+                                        None => {
+                                            // Overlong encoding or above U+10FFFF
+                                            self.buf = Three(c2, c3, c4); // push/pop
+                                            err!(c1);
+                                        }
+                                    }
+                                } else {
+                                    self.buf = Three(c2, c3, c4); // push/pop
+                                    err!(c1);
+                                }
+                            } else {
+                                self.buf = Two(c2, c3); // pop
+                                err!(c1);
+                            }
+                        }
+                        // Continuation byte or Invalid byte
+                        _ => {
+                            self.buf = Two(c2, c3); // pop
+                            err!(c1);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (low, high) = self.iter.size_hint();
+        // each char uses at most 4 bytes; up to 3 bytes may still be pending in `buf`
+        (low / 4, high.and_then(|h| h.checked_add(3)))
+    }
+}
+
+impl DecodeUtf8Error {
+    /// Returns the invalid byte which caused this error.
+    #[must_use]
+    pub fn invalid_byte(&self) -> u8 {
+        self.code
+    }
+}
+
+impl fmt::Display for DecodeUtf8Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "invalid byte found: {:x}", self.code)
+    }
+}
 
 /// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s.
 ///

From 94c3ecf1eb0e85a25345c80bf1fe13bcf2f3c2ca Mon Sep 17 00:00:00 2001
From: mbartlett21 <29034492+mbartlett21@users.noreply.github.com>
Date: Wed, 17 Nov 2021 22:10:51 +1000
Subject: [PATCH 2/4] Add instability for `DecodeUtf8`

---
 library/core/src/char/decode.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/library/core/src/char/decode.rs b/library/core/src/char/decode.rs
index c4b92ac93b35b..4aab0a29a456b 100644
--- a/library/core/src/char/decode.rs
+++ b/library/core/src/char/decode.rs
@@ -10,6 +10,7 @@ use super::{from_u32, from_u32_unchecked};
 /// documentation for more.
 ///
 /// [`decode_utf8`]: char::decode_utf8
+#[unstable(feature = "decode_utf8", issue = "none")]
 #[derive(Clone, Debug)]
 pub struct DecodeUtf8<I>
 where
@@ -22,6 +23,7 @@ where
 /// An error that can be returned when decoding UTF-8 code points.
 ///
 /// This `struct` is created when using the [`DecodeUtf8`] type.
+#[unstable(feature = "decode_utf8", issue = "none")]
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub struct DecodeUtf8Error {
     code: u8,
@@ -70,6 +72,7 @@ pub struct DecodeUtf8Error {
 /// );
 /// ```
 #[inline]
+#[unstable(feature = "decode_utf8", issue = "none")]
 pub fn decode_utf8<I: IntoIterator<Item = u8>>(iter: I) -> DecodeUtf8<I::IntoIter> {
     DecodeUtf8 {
         iter: iter.into_iter(),
@@ -85,6 +88,7 @@ enum DecodeUtf8Buffer {
     Three(u8, u8, u8),
 }
 
+#[unstable(feature = "decode_utf8", issue = "none")]
 impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
     type Item = Result<char, DecodeUtf8Error>;
 
@@ -278,11 +282,13 @@ impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
 impl DecodeUtf8Error {
     /// Returns the invalid byte which caused this error.
     #[must_use]
+    #[unstable(feature = "decode_utf8", issue = "none")]
     pub fn invalid_byte(&self) -> u8 {
         self.code
     }
 }
 
+#[unstable(feature = "decode_utf8", issue = "none")]
 impl fmt::Display for DecodeUtf8Error {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "invalid byte found: {:x}", self.code)

From 5ad8e4d6ad41ed4ed7641d3a3d4d011ecc29aee7 Mon Sep 17 00:00:00 2001
From: mbartlett21 <29034492+mbartlett21@users.noreply.github.com>
Date: Wed, 17 Nov 2021 22:12:17 +1000
Subject: [PATCH 3/4] Export `decode_utf8` from `core::char`

---
 library/core/src/char/mod.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs
index 0728523d0a413..145ba16da2948 100644
--- a/library/core/src/char/mod.rs
+++ b/library/core/src/char/mod.rs
@@ -36,6 +36,10 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
 #[stable(feature = "unicode_version", since = "1.45.0")]
 pub use crate::unicode::UNICODE_VERSION;
 
+// unstable re-exports
+#[unstable(feature = "decode_utf8", issue = "none")]
+pub use self::decode::{decode_utf8, DecodeUtf8, DecodeUtf8Error};
+
 // perma-unstable re-exports
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 pub use self::methods::encode_utf16_raw;

From d6b1c503fb3ce0428bd4521a6a3d759de78564ec Mon Sep 17 00:00:00 2001
From: mbartlett21 <29034492+mbartlett21@users.noreply.github.com>
Date: Wed, 17 Nov 2021 22:13:57 +1000
Subject: [PATCH 4/4] Add `decode_utf8` method to `char`

---
 library/core/src/char/methods.rs | 48 ++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs
index 3c4972bd3c9a4..b97fe494ac8cd 100644
--- a/library/core/src/char/methods.rs
+++ b/library/core/src/char/methods.rs
@@ -41,6 +41,54 @@ impl char {
     #[stable(feature = "assoc_char_consts", since = "1.52.0")]
     pub const UNICODE_VERSION: (u8, u8, u8) = crate::unicode::UNICODE_VERSION;
 
+    /// Creates an iterator over the UTF-8 encoded code points in `iter`, returning
+    /// invalid bytes as `Err`s.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// #![feature(decode_utf8)]
+    ///
+    /// // 🦀the<invalid>crab<invalid>
+    /// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
+    ///
+    /// assert_eq!(
+    ///     char::decode_utf8(v.iter().copied())
+    ///         .map(|r| r.map_err(|e| e.invalid_byte()))
+    ///         .collect::<Vec<_>>(),
+    ///     vec![
+    ///         Ok('🦀'),
+    ///         Ok('t'), Ok('h'), Ok('e'),
+    ///         Err(0xFF),
+    ///         Ok('c'), Ok('r'), Ok('a'), Ok('b'),
+    ///         Err(0x80),
+    ///     ]
+    /// );
+    /// ```
+    ///
+    /// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
+    ///
+    /// ```
+    /// #![feature(decode_utf8)]
+    ///
+    /// // 🦀the<invalid>crab<invalid>
+    /// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
+    ///
+    /// assert_eq!(
+    ///     char::decode_utf8(v.iter().copied())
+    ///        .map(|r| r.unwrap_or(char::REPLACEMENT_CHARACTER))
+    ///        .collect::<String>(),
+    ///     "🦀the�crab�"
+    /// );
+    /// ```
+    #[unstable(feature = "decode_utf8", issue = "none")]
+    #[inline]
+    pub fn decode_utf8<I: IntoIterator<Item = u8>>(iter: I) -> DecodeUtf8<I::IntoIter> {
+        super::decode::decode_utf8(iter)
+    }
+
     /// Creates an iterator over the UTF-16 encoded code points in `iter`,
     /// returning unpaired surrogates as `Err`s.
     ///