@@ -1806,39 +1806,71 @@ const fn len_utf16(code: u32) -> usize {
18061806#[ inline]
18071807pub const fn encode_utf8_raw ( code : u32 , dst : & mut [ u8 ] ) -> & mut [ u8 ] {
18081808 let len = len_utf8 ( code) ;
1809- match ( len, & mut * dst) {
1810- ( 1 , [ a, ..] ) => {
1811- * a = code as u8 ;
1812- }
1813- ( 2 , [ a, b, ..] ) => {
1814- * a = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1815- * b = ( code & 0x3F ) as u8 | TAG_CONT ;
1816- }
1817- ( 3 , [ a, b, c, ..] ) => {
1818- * a = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1819- * b = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1820- * c = ( code & 0x3F ) as u8 | TAG_CONT ;
1821- }
1822- ( 4 , [ a, b, c, d, ..] ) => {
1823- * a = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1824- * b = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1825- * c = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1826- * d = ( code & 0x3F ) as u8 | TAG_CONT ;
1827- }
1828- _ => {
1829- const_panic ! (
1830- "encode_utf8: buffer does not have enough bytes to encode code point" ,
1831- "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}" ,
1832- code: u32 = code,
1833- len: usize = len,
1834- dst_len: usize = dst. len( ) ,
1835- )
1836- }
1837- } ;
1809+ if dst. len ( ) < len {
1810+ const_panic ! (
1811+ "encode_utf8: buffer does not have enough bytes to encode code point" ,
1812+ "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}" ,
1813+ code: u32 = code,
1814+ len: usize = len,
1815+ dst_len: usize = dst. len( ) ,
1816+ ) ;
1817+ }
1818+
1819+ // SAFETY: `dst` is checked to be at least the length needed to encode the codepoint.
1820+ unsafe { encode_utf8_raw_unchecked ( code, dst. as_mut_ptr ( ) ) } ;
1821+
18381822 // SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds.
18391823 unsafe { slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , len) }
18401824}
18411825
1826+ /// Encodes a raw `u32` value as UTF-8 into the byte buffer pointed to by `dst`.
1827+ ///
1828+ /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1829+ /// (Creating a `char` in the surrogate range is UB.)
1830+ /// The result is valid [generalized UTF-8] but not valid UTF-8.
1831+ ///
1832+ /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
1833+ ///
1834+ /// # Safety
1835+ ///
1836+ /// The behavior is undefined if the buffer pointed to by `dst` is not
1837+ /// large enough to hold the encoded codepoint. A buffer of length four
1838+ /// is large enough to encode any `char`.
1839+ ///
1840+ /// For a safe version of this function, see the [`encode_utf8_raw`] function.
1841+ #[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
1842+ #[ doc( hidden) ]
1843+ #[ inline]
1844+ pub const unsafe fn encode_utf8_raw_unchecked ( code : u32 , dst : * mut u8 ) {
1845+ let len = len_utf8 ( code) ;
1846+ // SAFETY: The caller must guarantee that the buffer pointed to by `dst`
1847+ // is at least `len` bytes long.
1848+ unsafe {
1849+ match len {
1850+ 1 => {
1851+ * dst = code as u8 ;
1852+ }
1853+ 2 => {
1854+ * dst = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1855+ * dst. add ( 1 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1856+ }
1857+ 3 => {
1858+ * dst = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1859+ * dst. add ( 1 ) = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1860+ * dst. add ( 2 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1861+ }
1862+ 4 => {
1863+ * dst = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1864+ * dst. add ( 1 ) = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1865+ * dst. add ( 2 ) = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1866+ * dst. add ( 3 ) = ( code & 0x3F ) as u8 | TAG_CONT ;
1867+ }
1868+ // SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8.
1869+ _ => crate :: hint:: unreachable_unchecked ( ) ,
1870+ }
1871+ }
1872+ }
1873+
18421874/// Encodes a raw `u32` value as native endian UTF-16 into the provided `u16` buffer,
18431875/// and then returns the subslice of the buffer that contains the encoded character.
18441876///
0 commit comments