Rollup merge of #90607 - WaffleLapkin:const_str_from_utf8, r=oli-obk

JohnTitor · web-flow · commit b8ef644020e1 · 2021-11-18T23:11:12.000+09:00
Make slice->str conversion and related functions `const`

This PR marks the following APIs as `const`:
```rust
// core::str
pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error>;
pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error>;
pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str;

impl Utf8Error {
    pub const fn valid_up_to(&self) -> usize;
    pub const fn error_len(&self) -> Option<usize>;
}
```

Everything but `from_utf8_unchecked_mut` uses the `const_str_from_utf8` feature gate; `from_utf8_unchecked_mut` uses the `const_str_from_utf8_unchecked_mut` feature gate.

---

I'm not sure why `from_utf8_unchecked_mut` was left non-`const`, considering that `from_utf8_unchecked` is not only `const`, but **`const` stable**.

---

r? `@oli-obk` (performance-only `const_eval_select` use)
diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs
@@ -25,6 +25,7 @@
 #![feature(const_btree_new)]
 #![feature(const_default_impls)]
 #![feature(const_trait_impl)]
+#![feature(const_str_from_utf8)]
 
 use std::collections::hash_map::DefaultHasher;
 use std::hash::{Hash, Hasher};
diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs
@@ -1,3 +1,4 @@
+use std::assert_matches::assert_matches;
 use std::borrow::Cow;
 use std::cmp::Ordering::{Equal, Greater, Less};
 use std::str::{from_utf8, from_utf8_unchecked};
@@ -883,6 +884,33 @@ fn test_is_utf8() {
     assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
 }
 
+#[test]
+fn test_const_is_utf8() {
+    const _: () = {
+        // deny overlong encodings
+        assert!(from_utf8(&[0xc0, 0x80]).is_err());
+        assert!(from_utf8(&[0xc0, 0xae]).is_err());
+        assert!(from_utf8(&[0xe0, 0x80, 0x80]).is_err());
+        assert!(from_utf8(&[0xe0, 0x80, 0xaf]).is_err());
+        assert!(from_utf8(&[0xe0, 0x81, 0x81]).is_err());
+        assert!(from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err());
+        assert!(from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err());
+
+        // deny surrogates
+        assert!(from_utf8(&[0xED, 0xA0, 0x80]).is_err());
+        assert!(from_utf8(&[0xED, 0xBF, 0xBF]).is_err());
+
+        assert!(from_utf8(&[0xC2, 0x80]).is_ok());
+        assert!(from_utf8(&[0xDF, 0xBF]).is_ok());
+        assert!(from_utf8(&[0xE0, 0xA0, 0x80]).is_ok());
+        assert!(from_utf8(&[0xED, 0x9F, 0xBF]).is_ok());
+        assert!(from_utf8(&[0xEE, 0x80, 0x80]).is_ok());
+        assert!(from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok());
+        assert!(from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok());
+        assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
+    };
+}
+
 #[test]
 fn from_utf8_mostly_ascii() {
     // deny invalid bytes embedded in long stretches of ascii
@@ -895,13 +923,43 @@ fn from_utf8_mostly_ascii() {
     }
 }
 
+#[test]
+fn const_from_utf8_mostly_ascii() {
+    const _: () = {
+        // deny invalid bytes embedded in long stretches of ascii
+        let mut i = 32;
+        while i < 64 {
+            let mut data = [0; 128];
+            data[i] = 0xC0;
+            assert!(from_utf8(&data).is_err());
+            data[i] = 0xC2;
+            assert!(from_utf8(&data).is_err());
+
+            i = i + 1;
+        }
+    };
+}
+
 #[test]
 fn from_utf8_error() {
     macro_rules! test {
-        ($input: expr, $expected_valid_up_to: expr, $expected_error_len: expr) => {
+        ($input: expr, $expected_valid_up_to:pat, $expected_error_len:pat) => {
             let error = from_utf8($input).unwrap_err();
-            assert_eq!(error.valid_up_to(), $expected_valid_up_to);
-            assert_eq!(error.error_len(), $expected_error_len);
+            assert_matches!(error.valid_up_to(), $expected_valid_up_to);
+            assert_matches!(error.error_len(), $expected_error_len);
+
+            const _: () = {
+                match from_utf8($input) {
+                    Err(error) => {
+                        let valid_up_to = error.valid_up_to();
+                        let error_len = error.error_len();
+
+                        assert!(matches!(valid_up_to, $expected_valid_up_to));
+                        assert!(matches!(error_len, $expected_error_len));
+                    }
+                    Ok(_) => unreachable!(),
+                }
+            };
         };
     }
     test!(b"A\xC3\xA9 \xFF ", 4, Some(1));
diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs
@@ -97,6 +97,7 @@
 #![allow(explicit_outlives_requirements)]
 //
 // Library features for const fns:
+#![feature(const_align_offset)]
 #![feature(const_align_of_val)]
 #![feature(const_alloc_layout)]
 #![feature(const_arguments_as_str)]
@@ -130,6 +131,7 @@
 #![feature(const_size_of_val)]
 #![feature(const_slice_from_raw_parts)]
 #![feature(const_slice_ptr_len)]
+#![feature(const_str_from_utf8_unchecked_mut)]
 #![feature(const_swap)]
 #![feature(const_trait_impl)]
 #![feature(const_type_id)]
@@ -138,6 +140,7 @@
 #![feature(duration_consts_2)]
 #![feature(ptr_metadata)]
 #![feature(slice_ptr_get)]
+#![feature(str_internals)]
 #![feature(variant_count)]
 #![feature(const_array_from_ref)]
 #![feature(const_slice_from_ref)]
diff --git a/library/core/src/str/converts.rs b/library/core/src/str/converts.rs
@@ -82,10 +82,16 @@ use super::Utf8Error;
 /// assert_eq!("💖", sparkle_heart);
 /// ```
 #[stable(feature = "rust1", since = "1.0.0")]
-pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
-    run_utf8_validation(v)?;
-    // SAFETY: Just ran validation.
-    Ok(unsafe { from_utf8_unchecked(v) })
+#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
+pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
+    // This should use `?` again, once it's `const`
+    match run_utf8_validation(v) {
+        Ok(_) => {
+            // SAFETY: validation succeeded.
+            Ok(unsafe { from_utf8_unchecked(v) })
+        }
+        Err(err) => Err(err),
+    }
 }
 
 /// Converts a mutable slice of bytes to a mutable string slice.
@@ -119,10 +125,16 @@ pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
 /// See the docs for [`Utf8Error`] for more details on the kinds of
 /// errors that can be returned.
 #[stable(feature = "str_mut_extras", since = "1.20.0")]
-pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
-    run_utf8_validation(v)?;
-    // SAFETY: Just ran validation.
-    Ok(unsafe { from_utf8_unchecked_mut(v) })
+#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
+pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
+    // This should use `?` again, once it's `const`
+    match run_utf8_validation(v) {
+        Ok(_) => {
+            // SAFETY: validation succeeded.
+            Ok(unsafe { from_utf8_unchecked_mut(v) })
+        }
+        Err(err) => Err(err),
+    }
 }
 
 /// Converts a slice of bytes to a string slice without checking
@@ -184,7 +196,8 @@ pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
 #[inline]
 #[must_use]
 #[stable(feature = "str_mut_extras", since = "1.20.0")]
-pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
+#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked_mut", issue = "91005")]
+pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
     // SAFETY: the caller must guarantee that the bytes `v`
     // are valid UTF-8, thus the cast to `*mut str` is safe.
     // Also, the pointer dereference is safe because that pointer
diff --git a/library/core/src/str/error.rs b/library/core/src/str/error.rs
@@ -72,9 +72,10 @@ impl Utf8Error {
     /// assert_eq!(1, error.valid_up_to());
     /// ```
     #[stable(feature = "utf8_error", since = "1.5.0")]
+    #[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
     #[must_use]
     #[inline]
-    pub fn valid_up_to(&self) -> usize {
+    pub const fn valid_up_to(&self) -> usize {
         self.valid_up_to
     }
 
@@ -94,10 +95,15 @@ impl Utf8Error {
     ///
     /// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html
     #[stable(feature = "utf8_error_error_len", since = "1.20.0")]
+    #[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
     #[must_use]
     #[inline]
-    pub fn error_len(&self) -> Option<usize> {
-        self.error_len.map(|len| len as usize)
+    pub const fn error_len(&self) -> Option<usize> {
+        // This should become `map` again, once it's `const`
+        match self.error_len {
+            Some(len) => Some(len as usize),
+            None => None,
+        }
     }
 }
 
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -8,25 +8,25 @@ use super::Utf8Error;
 /// The first byte is special, only want bottom 5 bits for width 2, 4 bits
 /// for width 3, and 3 bits for width 4.
 #[inline]
-fn utf8_first_byte(byte: u8, width: u32) -> u32 {
+const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
     (byte & (0x7F >> width)) as u32
 }
 
 /// Returns the value of `ch` updated with continuation byte `byte`.
 #[inline]
-fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
+const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
     (ch << 6) | (byte & CONT_MASK) as u32
 }
 
 /// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
 /// bits `10`).
 #[inline]
-pub(super) fn utf8_is_cont_byte(byte: u8) -> bool {
+pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
     (byte as i8) < -64
 }
 
 #[inline]
-fn unwrap_or_0(opt: Option<&u8>) -> u8 {
+const fn unwrap_or_0(opt: Option<&u8>) -> u8 {
     match opt {
         Some(&byte) => byte,
         None => 0,
@@ -105,14 +105,15 @@ const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;
 
 /// Returns `true` if any byte in the word `x` is nonascii (>= 128).
 #[inline]
-fn contains_nonascii(x: usize) -> bool {
+const fn contains_nonascii(x: usize) -> bool {
     (x & NONASCII_MASK) != 0
 }
 
 /// Walks through `v` checking that it's a valid UTF-8 sequence,
 /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
 #[inline(always)]
-pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
+#[rustc_const_unstable(feature = "str_internals", issue = "none")]
+pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
     let mut index = 0;
     let len = v.len();
 
@@ -142,7 +143,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
 
         let first = v[index];
         if first >= 128 {
-            let w = UTF8_CHAR_WIDTH[first as usize];
+            let w = utf8_char_width(first);
             // 2-byte encoding is for codepoints  \u{0080} to  \u{07ff}
             //        first  C2 80        last DF BF
             // 3-byte encoding is for codepoints  \u{0800} to  \u{ffff}
@@ -230,7 +231,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
 }
 
 // https://tools.ietf.org/html/rfc3629
-static UTF8_CHAR_WIDTH: [u8; 256] = [
+const UTF8_CHAR_WIDTH: &[u8; 256] = &[
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, // 0x1F
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -253,7 +254,7 @@ static UTF8_CHAR_WIDTH: [u8; 256] = [
 #[unstable(feature = "str_internals", issue = "none")]
 #[must_use]
 #[inline]
-pub fn utf8_char_width(b: u8) -> usize {
+pub const fn utf8_char_width(b: u8) -> usize {
     UTF8_CHAR_WIDTH[b as usize] as usize
 }