Skip to content

Add a way to decode utf-8 while handling errors #90980

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
293 changes: 292 additions & 1 deletion library/core/src/char/decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,298 @@

use crate::fmt;

use super::from_u32_unchecked;
use super::{from_u32, from_u32_unchecked};

/// An iterator adaptor that turns a stream of `u8`s into UTF-8 decoded code points.
///
/// Values of this `struct` are produced by the [`decode_utf8`] method on [`char`].
/// Refer to that method's documentation for details.
///
/// [`decode_utf8`]: char::decode_utf8
#[unstable(feature = "decode_utf8", issue = "none")]
#[derive(Clone, Debug)]
pub struct DecodeUtf8<I: Iterator<Item = u8>> {
    // The underlying source of bytes.
    iter: I,
    // Bytes already pulled from `iter` that have not been yielded yet.
    buf: DecodeUtf8Buffer,
}

/// The error produced when a byte cannot be decoded as part of a UTF-8 code point.
///
/// Values of this `struct` are yielded by the [`DecodeUtf8`] iterator.
#[unstable(feature = "decode_utf8", issue = "none")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct DecodeUtf8Error {
    // The byte that was rejected.
    code: u8,
}

/// Creates an iterator over the UTF-8 encoded code points in `iter`, returning
/// invalid bytes as `Err`s.
///
/// Each item is either `Ok(char)` for a successfully decoded code point or
/// `Err(DecodeUtf8Error)` carrying a single byte that could not be decoded.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// #![feature(decode_utf8)]
///
/// use std::char::decode_utf8;
///
/// // 🦀the<invalid>crab<invalid>
/// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
///
/// assert_eq!(
///     decode_utf8(v.iter().copied())
///         .map(|r| r.map_err(|e| e.invalid_byte()))
///         .collect::<Vec<_>>(),
///     vec![
///         Ok('🦀'),
///         Ok('t'), Ok('h'), Ok('e'),
///         Err(0xFF),
///         Ok('c'), Ok('r'), Ok('a'), Ok('b'),
///         Err(0x80),
///     ]
/// );
/// ```
///
/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
///
/// ```
/// #![feature(decode_utf8)]
///
/// use std::char::{decode_utf8, REPLACEMENT_CHARACTER};
///
/// // 🦀the<invalid>crab<invalid>
/// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
///
/// assert_eq!(
///     decode_utf8(v.iter().copied())
///         .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
///         .collect::<String>(),
///     "🦀the�crab�"
/// );
/// ```
#[inline]
#[unstable(feature = "decode_utf8", issue = "none")]
pub fn decode_utf8<I: IntoIterator<Item = u8>>(iter: I) -> DecodeUtf8<I::IntoIter> {
    // Start with an empty look-ahead buffer; bytes are only pulled lazily by `next`.
    DecodeUtf8 {
        iter: iter.into_iter(),
        buf: DecodeUtf8Buffer::Empty,
    }
}

// Look-ahead storage for `DecodeUtf8`: bytes that were already read from the
// underlying iterator but have not been yielded yet, oldest byte first.
#[derive(Debug, Clone)]
enum DecodeUtf8Buffer {
    // Nothing is buffered.
    Empty,
    // One pending byte.
    One(u8),
    // Two pending bytes.
    Two(u8, u8),
    // Three pending bytes.
    Three(u8, u8, u8),
}

#[unstable(feature = "decode_utf8", issue = "none")]
impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
    type Item = Result<char, DecodeUtf8Error>;

    /// Yields the next decoded `char`, or the next byte that cannot be part of a
    /// well-formed UTF-8 sequence as an `Err`.
    ///
    /// Up to three bytes of look-ahead are kept in `self.buf`. When a sequence turns
    /// out to be malformed only its first byte is reported; the remaining bytes stay
    /// buffered and are re-examined from scratch on later calls.
    ///
    /// Returns `None` once the buffer is empty and the underlying iterator is exhausted.
    fn next(&mut self) -> Option<Result<char, DecodeUtf8Error>> {
        use DecodeUtf8Buffer::*;

        // True when the byte has the `10xx_xxxx` shape of a continuation byte.
        macro_rules! valid_cont {
            ($cont:expr) => {
                (0b1000_0000..=0b1011_1111).contains(&$cont)
            };
        }

        // Returns from `next`, reporting `$c` as the invalid byte.
        macro_rules! err {
            ($c:expr) => {
                return Some(Err(DecodeUtf8Error { code: $c }))
            };
        }

        // Assembles a 2-byte sequence. Lead bytes 0xC0/0xC1 (overlong) are never passed
        // in because the callers only match 0xC2..=0xDF.
        #[inline(always)]
        fn from_utf8x2(c1: u8, c2: u8) -> char {
            let c = (c2 as u32 & 0b0011_1111) + ((c1 as u32 & 0b0001_1111) << 6);
            // SAFETY: both operands are masked, so `c <= 0x7FF`, which is below the
            // surrogate range starting at 0xD800.
            unsafe { from_u32_unchecked(c) }
        }

        // Assembles a 3-byte sequence. Returns `None` for overlong encodings
        // (values below 0x800) and, via `from_u32`, for surrogates.
        #[inline(always)]
        fn from_utf8x3(c1: u8, c2: u8, c3: u8) -> Option<char> {
            let c = (c3 as u32 & 0b0011_1111)
                + ((c2 as u32 & 0b0011_1111) << 6)
                + ((c1 as u32 & 0b0000_1111) << 12);
            if c < 0x800 { None } else { from_u32(c) }
        }

        // Assembles a 4-byte sequence. Returns `None` for overlong encodings
        // (values below 0x1_0000) and, via `from_u32`, for values above 0x10FFFF.
        #[inline(always)]
        fn from_utf8x4(c1: u8, c2: u8, c3: u8, c4: u8) -> Option<char> {
            let c = (c4 as u32 & 0b0011_1111)
                + ((c3 as u32 & 0b0011_1111) << 6)
                + ((c2 as u32 & 0b0011_1111) << 12)
                + ((c1 as u32 & 0b0000_0111) << 18);
            if c < 0x1_0000 { None } else { from_u32(c) }
        }

        loop {
            match self.buf {
                Empty | One(_) => {
                    // Take the byte to classify from the buffer if there is one,
                    // otherwise from the underlying iterator.
                    let c = match self.buf {
                        Empty => self.iter.next()?,
                        One(c) => {
                            self.buf = Empty;
                            c
                        }
                        _ => unreachable!(),
                    };
                    match c {
                        // ASCII
                        0x00..=0x7F => return Some(Ok(c as char)),
                        // Lead byte of a 2-, 3- or 4-byte sequence. 0xF5..=0xFF can
                        // never start a valid sequence and fall through to the error arm.
                        0xC2..=0xF4 => {
                            if let Some(cont) = self.iter.next() {
                                self.buf = Two(c, cont); // push2
                            } else {
                                // Sequence truncated by the end of input.
                                err!(c);
                            }
                        }
                        // Stray continuation byte or a byte that never occurs in UTF-8
                        _ => err!(c),
                    }
                }
                Two(c1, c2) => {
                    match c1 {
                        // ASCII
                        0x00..=0x7F => {
                            self.buf = One(c2); // pop
                            return Some(Ok(c1 as char));
                        }
                        // Lead byte of a 2-byte sequence
                        0xC2..=0xDF => {
                            if valid_cont!(c2) {
                                self.buf = Empty; // pop2
                                return Some(Ok(from_utf8x2(c1, c2)));
                            } else {
                                self.buf = One(c2); // pop
                                err!(c1);
                            }
                        }
                        // Lead byte of a 3- or 4-byte sequence: need one more byte
                        0xE0..=0xF4 => {
                            if let Some(cont) = self.iter.next() {
                                self.buf = Three(c1, c2, cont); // push
                            } else {
                                self.buf = One(c2); // pop
                                err!(c1);
                            }
                        }
                        // Stray continuation byte or invalid byte
                        _ => {
                            self.buf = One(c2); // pop
                            err!(c1);
                        }
                    }
                }
                Three(c1, c2, c3) => {
                    match c1 {
                        // ASCII
                        0x00..=0x7F => {
                            self.buf = Two(c2, c3); // pop
                            return Some(Ok(c1 as char));
                        }
                        // Lead byte of a 2-byte sequence
                        0xC2..=0xDF => {
                            if valid_cont!(c2) {
                                self.buf = One(c3); // pop2
                                return Some(Ok(from_utf8x2(c1, c2)));
                            } else {
                                self.buf = Two(c2, c3); // pop
                                err!(c1);
                            }
                        }
                        // Lead byte of a 3-byte sequence
                        0xE0..=0xEF => {
                            if valid_cont!(c2) && valid_cont!(c3) {
                                if let Some(c) = from_utf8x3(c1, c2, c3) {
                                    self.buf = Empty; // pop3
                                    return Some(Ok(c));
                                }
                            }
                            // Bad continuation byte, overlong encoding or surrogate:
                            // report the lead byte and re-examine the rest.
                            self.buf = Two(c2, c3); // pop
                            err!(c1);
                        }
                        // Lead byte of a 4-byte sequence
                        0xF0..=0xF4 => {
                            // Both buffered bytes must be continuation bytes before
                            // it is worth pulling the fourth byte.
                            if !(valid_cont!(c2) && valid_cont!(c3)) {
                                self.buf = Two(c2, c3); // pop
                                err!(c1);
                            }
                            match self.iter.next() {
                                Some(c4) if valid_cont!(c4) => {
                                    match from_utf8x4(c1, c2, c3, c4) {
                                        Some(c) => {
                                            self.buf = Empty; // pop3
                                            return Some(Ok(c));
                                        }
                                        None => {
                                            // Overlong encoding or above 0x10FFFF
                                            self.buf = Three(c2, c3, c4); // push/pop
                                            err!(c1);
                                        }
                                    }
                                }
                                Some(c4) => {
                                    self.buf = Three(c2, c3, c4); // push/pop
                                    err!(c1);
                                }
                                None => {
                                    // Sequence truncated by the end of input.
                                    self.buf = Two(c2, c3); // pop
                                    err!(c1);
                                }
                            }
                        }
                        // Stray continuation byte or invalid byte
                        _ => {
                            self.buf = Two(c2, c3); // pop
                            err!(c1);
                        }
                    }
                }
            }
        }
    }

    /// Bounds the number of remaining items from the number of pending bytes.
    ///
    /// Pending bytes are those still in the underlying iterator plus those held in the
    /// look-ahead buffer. Every yielded item consumes between one and four of them.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let buffered = match self.buf {
            DecodeUtf8Buffer::Empty => 0,
            DecodeUtf8Buffer::One(..) => 1,
            DecodeUtf8Buffer::Two(..) => 2,
            DecodeUtf8Buffer::Three(..) => 3,
        };
        let (low, high) = self.iter.size_hint();
        // Fewest items: everything pending is 4-byte characters (rounded up).
        let pending = low.saturating_add(buffered);
        let lower = pending / 4 + (pending % 4 != 0) as usize;
        // Most items: every pending byte yields its own item.
        let upper = high.and_then(|h| h.checked_add(buffered));
        (lower, upper)
    }
}

impl DecodeUtf8Error {
    /// Gives back the byte that could not be decoded and triggered this error.
    #[must_use]
    #[unstable(feature = "decode_utf8", issue = "none")]
    pub fn invalid_byte(&self) -> u8 {
        let Self { code } = *self;
        code
    }
}

#[unstable(feature = "decode_utf8", issue = "none")]
impl fmt::Display for DecodeUtf8Error {
    /// Writes a fixed message followed by the rejected byte in lowercase hexadecimal.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let byte = self.code;
        f.write_fmt(format_args!("invalid byte found: {:x}", byte))
    }
}

/// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s.
///
Expand Down
48 changes: 48 additions & 0 deletions library/core/src/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,54 @@ impl char {
#[stable(feature = "assoc_char_consts", since = "1.52.0")]
pub const UNICODE_VERSION: (u8, u8, u8) = crate::unicode::UNICODE_VERSION;

/// Creates an iterator over the UTF-8 encoded code points in `iter`, returning
/// invalid bytes as `Err`s.
///
/// Each item is either `Ok(char)` for a successfully decoded code point or
/// `Err(DecodeUtf8Error)` carrying a single byte that could not be decoded.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// #![feature(decode_utf8)]
///
/// // 🦀the<invalid>crab<invalid>
/// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
///
/// assert_eq!(
///     char::decode_utf8(v.iter().copied())
///         .map(|r| r.map_err(|e| e.invalid_byte()))
///         .collect::<Vec<_>>(),
///     vec![
///         Ok('🦀'),
///         Ok('t'), Ok('h'), Ok('e'),
///         Err(0xFF),
///         Ok('c'), Ok('r'), Ok('a'), Ok('b'),
///         Err(0x80),
///     ]
/// );
/// ```
///
/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
///
/// ```
/// #![feature(decode_utf8)]
///
/// // 🦀the<invalid>crab<invalid>
/// let v = b"\xf0\x9f\xa6\x80the\xFFcrab\x80";
///
/// assert_eq!(
///     char::decode_utf8(v.iter().copied())
///         .map(|r| r.unwrap_or(char::REPLACEMENT_CHARACTER))
///         .collect::<String>(),
///     "🦀the�crab�"
/// );
/// ```
#[unstable(feature = "decode_utf8", issue = "none")]
#[inline]
pub fn decode_utf8<I: IntoIterator<Item = u8>>(iter: I) -> DecodeUtf8<I::IntoIter> {
    // Thin forwarder to the free function in the `decode` module.
    super::decode::decode_utf8(iter)
}

/// Creates an iterator over the UTF-16 encoded code points in `iter`,
/// returning unpaired surrogates as `Err`s.
///
Expand Down
4 changes: 4 additions & 0 deletions library/core/src/char/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error};
#[stable(feature = "unicode_version", since = "1.45.0")]
pub use crate::unicode::UNICODE_VERSION;

// unstable re-exports
#[unstable(feature = "decode_utf8", issue = "none")]
pub use self::decode::{decode_utf8, DecodeUtf8, DecodeUtf8Error};

// perma-unstable re-exports
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
pub use self::methods::encode_utf16_raw;
Expand Down