-
-
Notifications
You must be signed in to change notification settings - Fork 497
Recognize supplementary characters #913
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,12 @@ | ||
| // Copyright (c) Alexandre Mutel. All rights reserved. | ||
| // This file is licensed under the BSD-Clause 2 license. | ||
| // This file is licensed under the BSD-Clause 2 license. | ||
| // See the license.txt file in the project root for more information. | ||
|
|
||
| using System.Buffers; | ||
| using System.Diagnostics; | ||
| using System.Globalization; | ||
| using System.Runtime.CompilerServices; | ||
| using System.Text; | ||
|
|
||
| namespace Markdig.Helpers; | ||
|
|
||
|
|
@@ -69,10 +70,10 @@ public static class CharHelper | |
| private static readonly SearchValues<char> s_escapableSymbolChars = SearchValues.Create("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~•"); | ||
|
|
||
// Returns true for the four symbols that are exempted from the regular punctuation handling:
// U+2212 MINUS SIGN, ASCII hyphen-minus, dagger and double dagger.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsPunctuationException(char c)
{
    switch (c)
    {
        case '−':
        case '-':
        case '†':
        case '‡':
            return true;
        default:
            return false;
    }
}

// Rune overload: only BMP runes can match, since all four exception symbols are single UTF-16 chars.
private static bool IsPunctuationException(Rune c)
{
    if (!c.IsBmp)
    {
        return false;
    }

    char bmpChar = (char)c.Value;
    return bmpChar == '−' || bmpChar == '-' || bmpChar == '†' || bmpChar == '‡';
}
|
|
||
| public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWord, out bool canOpen, out bool canClose) | ||
| public static void CheckOpenCloseDelimiter(Rune pc, Rune c, bool enableWithinWord, out bool canOpen, out bool canClose) | ||
| { | ||
| pc.CheckUnicodeCategory(out bool prevIsWhiteSpace, out bool prevIsPunctuation); | ||
| c.CheckUnicodeCategory(out bool nextIsWhiteSpace, out bool nextIsPunctuation); | ||
|
|
@@ -100,13 +101,13 @@ public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWor | |
| if (!enableWithinWord) | ||
| { | ||
| var temp = canOpen; | ||
| // A single _ character can open emphasis iff it is part of a left-flanking delimiter run and either | ||
| // (a) not part of a right-flanking delimiter run or | ||
| // A single _ character can open emphasis iff it is part of a left-flanking delimiter run and either | ||
| // (a) not part of a right-flanking delimiter run or | ||
| // (b) part of a right-flanking delimiter run preceded by punctuation. | ||
| canOpen = canOpen && (!canClose || prevIsPunctuation); | ||
|
|
||
| // A single _ character can close emphasis iff it is part of a right-flanking delimiter run and either | ||
| // (a) not part of a left-flanking delimiter run or | ||
| // (a) not part of a left-flanking delimiter run or | ||
| // (b) part of a left-flanking delimiter run followed by punctuation. | ||
| canClose = canClose && (!temp || nextIsPunctuation); | ||
| } | ||
|
|
@@ -199,6 +200,9 @@ public static bool IsWhitespace(this char c) | |
| return IsWhitespaceRare(c); | ||
| } | ||
|
|
||
/// <summary>
/// Determines whether the rune is whitespace. Only runes in the Basic Multilingual Plane are
/// forwarded to the <see cref="char"/> overload; every supplementary code point yields <c>false</c>.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsWhitespace(this Rune r)
{
    if (!r.IsBmp)
    {
        return false;
    }

    return IsWhitespace((char)r.Value);
}
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| public static bool IsWhiteSpaceOrZero(this char c) | ||
| { | ||
|
|
@@ -263,6 +267,52 @@ public static void CheckUnicodeCategory(this char c, out bool space, out bool pu | |
| } | ||
| } | ||
|
|
||
#if !(NETSTANDARD2_1_OR_GREATER || NETCOREAPP2_1_OR_GREATER)
// Lazily built delegate for a non-public static `char.GetUnicodeCategory(int)` resolved through reflection.
// The lazy value is null when the runtime exposes no such method.
// The parameter list is pinned to (int) so that a name-only lookup cannot fail with an
// AmbiguousMatchException if several non-public overloads exist.
// NOTE(review): a collaborator commented on this declaration suggesting a different fallback;
// the comment is truncated in this capture, so the suggestion is not applied here.
private static readonly Lazy<Func<int, UnicodeCategory>?> GetUnicodeCategoryReflection =
    new(() => (Func<int, UnicodeCategory>?)typeof(char)
        .GetMethod(
            "GetUnicodeCategory",
            System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static,
            binder: null,
            types: new[] { typeof(int) },
            modifiers: null)
        ?.CreateDelegate(typeof(Func<int, UnicodeCategory>)));
#endif
|
|
||
// Reports whether a rune is a space or a punctuation.
//  - Whitespace runes: space = true, punctuation = false.
//  - Remaining ASCII runes (value <= 127): space only for U+0000, punctuation via IsAsciiPunctuationOrZero.
//  - Everything else: never space; punctuation when the rune's Unicode category bit is set in
//    CommonMarkPunctuationCategoryMask.
public static void CheckUnicodeCategory(this Rune c, out bool space, out bool punctuation)
{
    if (IsWhitespace(c))
    {
        space = true;
        punctuation = false;
        return;
    }

    int value = c.Value;
    if (value <= 127)
    {
        space = value == 0;
        punctuation = c.IsBmp && IsAsciiPunctuationOrZero((char)value);
        return;
    }

#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
    UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(value);
#else
    UnicodeCategory category = GetUnicodeCategoryFallback(c);
#endif
    int categoryBit = 1 << (int)category;
    space = false;
    punctuation = (CommonMarkPunctuationCategoryMask & categoryBit) != 0;

#if !(NETSTANDARD2_1_OR_GREATER || NETCOREAPP2_1_OR_GREATER)
    // Category lookup for targets where the int-based API is not called directly.
    static UnicodeCategory GetUnicodeCategoryFallback(Rune c)
    {
        // BMP runes fit in a single char and can use the char-based API.
        if (c.IsBmp)
        {
            return CharUnicodeInfo.GetUnicodeCategory((char)c.Value);
        }

        // Prefer the reflected int-based lookup when it could be resolved.
        Func<int, UnicodeCategory>? reflected = GetUnicodeCategoryReflection.Value;
        if (reflected is not null)
        {
            return reflected(c.Value);
        }

        // Last resort: materialise the rune as a string and query the category at index 0.
        return CharUnicodeInfo.GetUnicodeCategory(c.ToString(), 0);
    }
#endif
}
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| internal static bool IsSpaceOrPunctuationForGFMAutoLink(char c) | ||
| { | ||
|
|
@@ -309,15 +359,15 @@ public static bool IsZero(this char c) | |
/// <summary>
/// Determines whether the character is a space (U+0020), per CommonMark section 2.1 "Characters and lines".
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsSpace(this char c) => c == '\u0020';
|
|
||
/// <summary>
/// Determines whether the character is a tab (U+0009), per CommonMark section 2.1 "Characters and lines".
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsTab(this char c) => c == '\u0009';
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,12 @@ | ||
| // Copyright (c) Alexandre Mutel. All rights reserved. | ||
| // This file is licensed under the BSD-Clause 2 license. | ||
| // This file is licensed under the BSD-Clause 2 license. | ||
| // See the license.txt file in the project root for more information. | ||
|
|
||
| #nullable disable | ||
|
|
||
| using System.Runtime.CompilerServices; | ||
| using System.Runtime.InteropServices; | ||
| using System.Text; | ||
|
|
||
| namespace Markdig.Helpers; | ||
|
|
||
|
|
@@ -114,7 +115,7 @@ internal StringSlice(string text, int start, int end, NewLine newLine, bool dumm | |
| public NewLine NewLine; | ||
|
|
||
| /// <summary> | ||
| /// Gets the current character. | ||
| /// Gets the current character . | ||
| /// </summary> | ||
| public readonly char CurrentChar | ||
| { | ||
|
|
@@ -125,6 +126,31 @@ public readonly char CurrentChar | |
| } | ||
| } | ||
|
|
||
/// <summary>
/// Gets the rune (Unicode scalar value) at <see cref="Start"/>, combining surrogate pairs into a single
/// supplementary code point. Returns default when the slice is exhausted or the surrogate is unpaired.
/// </summary>
public readonly Rune CurrentRune
{
    get
    {
        int position = Start;
        if (position > End)
        {
            return default;
        }

        char current = Text[position];
        if (!char.IsSurrogate(current))
        {
            return new Rune(current);
        }

        if (char.IsHighSurrogate(current))
        {
            // The low half must still be inside the slice.
            if (position + 1 > End)
            {
                return default;
            }

            char trailing = Text[position + 1];
            return char.IsLowSurrogate(trailing) ? new Rune(current, trailing) : default;
        }

        // Positioned on a low surrogate: complete the pair with the char just before Start.
        // NOTE(review): a collaborator flagged this backward read as a likely usage error and
        // suggested avoiding it in these helpers; behaviour is kept unchanged here.
        if (position < 1)
        {
            return default;
        }

        char leading = Text[position - 1];
        return char.IsHighSurrogate(leading) ? new Rune(leading, current) : default;
    }
}
|
|
||
| /// <summary> | ||
| /// Gets a value indicating whether this instance is empty. | ||
| /// </summary> | ||
|
|
@@ -145,6 +171,35 @@ public readonly char this[int index] | |
| get => Text[index]; | ||
| } | ||
|
|
||
/// <summary>
/// Gets the Unicode scalar value (rune) at the specified index.
/// Recognizes supplementary code points that cannot be covered by a single character.
/// </summary>
/// <param name="index">
/// The index into <see cref="Text"/> (it is used as-is, not offset by <see cref="Start"/>).
/// Surrogate pairs are only completed with chars between <see cref="Start"/> and <see cref="End"/>.
/// </param>
/// <returns>
/// The rune at the specified index. When the index is on a low surrogate, the pair is completed with the
/// preceding high surrogate. Returns the default value (refers to <c>'\0'</c>) if the index is outside
/// <see cref="Text"/> or the surrogate at the index has no matching partner inside the slice.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Rune RuneAt(int index)
{
    var text = Text;
    if (text is null || (uint)index >= (uint)text.Length)
    {
        return default;
    }

    var first = text[index];
    if (!char.IsSurrogate(first))
    {
        return new Rune(first);
    }

    if (char.IsHighSurrogate(first))
    {
        // A high surrogate can only be completed by the NEXT char. It must never fall through to the
        // backward lookup below, where a preceding high surrogate would make the Rune constructor throw.
        if (index + 1 > End || index + 1 >= text.Length)
        {
            return default;
        }

        var second = text[index + 1];
        return char.IsLowSurrogate(second) ? new Rune(first, second) : default;
    }

    // Low surrogate: the pair starts one char earlier, which must still belong to the slice.
    if (index < Start + 1)
    {
        return default;
    }

    var trueFirst = text[index - 1];
    return char.IsHighSurrogate(trueFirst) ? new Rune(trueFirst, first) : default;
}
|
|
||
|
|
||
| /// <summary> | ||
| /// Goes to the next character, incrementing the <see cref="Start" /> position. | ||
|
|
@@ -166,6 +221,36 @@ public char NextChar() | |
| return Text[start]; | ||
| } | ||
|
|
||
/// <summary>
/// Advances <see cref="Start"/> to the next rune and returns it.
/// </summary>
/// <returns>
/// The rune at the new position. When that rune is a surrogate pair, <see cref="Start"/> ends up on its
/// low surrogate. Returns default when the end of the slice is reached or the surrogate is unpaired.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public Rune NextRune()
{
    int position = Start;
    if (position >= End)
    {
        // Nothing left: park Start just past the end.
        Start = End + 1;
        return default;
    }

    position += 1;
    Start = position;

    char lead = Text[position];
    if (char.IsSurrogate(lead))
    {
        bool pairFitsInSlice = position + 1 <= End;
        if (char.IsHighSurrogate(lead) && pairFitsInSlice)
        {
            char trail = Text[position + 1];
            if (char.IsLowSurrogate(trail))
            {
                // Consume the low surrogate as well.
                Start = position + 1;
                return new Rune(lead, trail);
            }
        }

        return default;
    }

    return new Rune(lead);
}
|
|
||
| /// <summary> | ||
| /// Goes to the next character, incrementing the <see cref="Start" /> position. | ||
| /// </summary> | ||
|
|
@@ -244,6 +329,53 @@ public readonly char PeekCharExtra(int offset) | |
| return (uint)index < (uint)text.Length ? text[index] : '\0'; | ||
| } | ||
|
|
||
/// <summary>
/// Peeks a rune at the specified offset from the current beginning of the slice
/// without using the range <see cref="Start"/> or <see cref="End"/>, returns default if outside the <see cref="Text"/>.
/// Recognizes supplementary code points that cannot be covered by a single character.
/// </summary>
/// <param name="offset">The offset, in chars, added to <see cref="Start"/>.</param>
/// <returns>
/// The rune at the specified offset. A high surrogate is combined with the following low surrogate and a
/// low surrogate with the preceding high surrogate. Returns default if the position is outside
/// <see cref="Text"/> or the surrogate found there is unpaired.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Rune PeekRuneExtra(int offset)
{
    var index = Start + offset;
    var text = Text;
    // The unsigned comparison also rejects negative indices.
    if ((uint)index >= (uint)text.Length)
    {
        return default;
    }

    var current = text[index];
    if (!char.IsSurrogate(current))
    {
        return new Rune(current);
    }

    if (char.IsHighSurrogate(current))
    {
        // High surrogate: the pair is completed by the NEXT char.
        if (index + 1 >= text.Length)
        {
            return default;
        }

        var lowSurrogate = text[index + 1];
        return char.IsLowSurrogate(lowSurrogate) ? new Rune(current, lowSurrogate) : default;
    }

    // Low surrogate: the pair started one char earlier (index 1 is valid, its partner sits at index 0).
    if (index < 1)
    {
        return default;
    }

    var highSurrogate = text[index - 1];
    return char.IsHighSurrogate(highSurrogate) ? new Rune(highSurrogate, current) : default;
}
|
|
||
| /// <summary> | ||
| /// Matches the specified text. | ||
| /// </summary> | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,10 @@ | |
| <PackageReference Include="System.Memory" Version="4.6.0" /> | ||
| </ItemGroup> | ||
|
|
||
| <ItemGroup Condition=" '$(TargetFramework)' == 'net462' OR '$(TargetFramework)' == 'netstandard2.0' OR '$(TargetFramework)' == 'netstandard2.1'"> | ||
| <PackageReference Include="Shim.System.Text.Rune" Version="6.0.2" /> | ||
| </ItemGroup> | ||
|
Comment on lines
+31
to
+33
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please remove the new dependency; we should polyfill the required Rune logic instead. The new |
||
|
|
||
| <ItemGroup> | ||
| <None Include="../../img/markdig.png" Pack="true" PackagePath="" /> | ||
| <None Include="../../readme.md" Pack="true" PackagePath="/"/> | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please make these new helpers internal.
No two specs agree on what space means, so I'd avoid exposing more ambiguous extensions on common types (it's not obvious this is CommonMark-specific).
(same for all new helpers in this file)