Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/Markdig.Tests/TestEmphasisPlus.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,19 @@ public void NormalStrongNormal()
TestParser.TestSpec("normal ***Strong emphasis*** normal", "<p>normal <em><strong>Strong emphasis</strong></em> normal</p>", "");
}

[Test]
public void SupplementaryPunctuation()
{
    // Six paragraphs, each placing a symbol (one BMP, two supplementary-plane) directly
    // inside a '*' pair. The expected HTML equals the input wrapped in <p> tags,
    // i.e. none of the '*' characters may be turned into emphasis.
    const string markdown = "a*a∇*a\n\na*∇a*a\n\na*a𝜵*a\n\na*𝜵a*a\n\na*𐬼a*a\n\na*a𐬼*a";
    const string expectedHtml = "<p>a*a∇*a</p>\n<p>a*∇a*a</p>\n<p>a*a𝜵*a</p>\n<p>a*𝜵a*a</p>\n<p>a*𐬼a*a</p>\n<p>a*a𐬼*a</p>";

    TestParser.TestSpec(markdown, expectedHtml, "");
}

[Test]
public void RecognizeSupplementaryChars()
{
    // Every "**…**" run in the input is expected to render as <strong>, whether it is
    // surrounded by emoji, supplementary-plane ideographs or BMP ideographs.
    const string markdown = "🌶️**𰻞**🍜**𰻞**🌶️**麺**🍜";
    const string expectedHtml = "<p>🌶️<strong>𰻞</strong>🍜<strong>𰻞</strong>🌶️<strong>麺</strong>🍜</p>";

    TestParser.TestSpec(markdown, expectedHtml, "");
}


[Test]
public void OpenEmphasisHasConvenientContentStringSlice()
{
Expand Down
10 changes: 10 additions & 0 deletions src/Markdig.Tests/TestSmartyPants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,14 @@ public void MappingCanBeReconfigured_HandlesRemovedMappings()

TestParser.TestSpec("<<test>>", "<p>&laquo;test&raquo;</p>", pipeline);
}

[Test]
public void RecognizesSupplementaryCharacters()
{
    // Straight double quotes are placed between supplementary-plane characters; the expected
    // HTML pins which of them become &ldquo; and which become &rdquo;.
    const string markdown = "\"𝜵\"𠮷\"𝜵\"𩸽\"";
    const string expectedHtml = "<p>&ldquo;𝜵&ldquo;𠮷&rdquo;𝜵&ldquo;𩸽&rdquo;</p>";

    var builder = new MarkdownPipelineBuilder();
    var pipeline = builder.UseSmartyPants().Build();

    TestParser.TestSpec(markdown, expectedHtml, pipeline);
}
}
6 changes: 3 additions & 3 deletions src/Markdig/Extensions/SmartyPants/SmartyPantsInlineParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public override bool Match(InlineProcessor processor, ref StringSlice slice)
// -- – &ndash; 'ndash'
// --- — &mdash; 'mdash'

var pc = slice.PeekCharExtra(-1);
var pc = slice.PeekRuneExtra(-1);
var c = slice.CurrentChar;
var openingChar = c;

Expand Down Expand Up @@ -93,9 +93,9 @@ public override bool Match(InlineProcessor processor, ref StringSlice slice)
}

// Skip char
c = slice.NextChar();
var next = slice.NextRune();

CharHelper.CheckOpenCloseDelimiter(pc, c, false, out bool canOpen, out bool canClose);
CharHelper.CheckOpenCloseDelimiter(pc, next, false, out bool canOpen, out bool canClose);

bool postProcess = false;

Expand Down
68 changes: 59 additions & 9 deletions src/Markdig/Helpers/CharHelper.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
// Copyright (c) Alexandre Mutel. All rights reserved.
// This file is licensed under the BSD-Clause 2 license.
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.

using System.Buffers;
using System.Diagnostics;
using System.Globalization;
using System.Runtime.CompilerServices;
using System.Text;

namespace Markdig.Helpers;

Expand Down Expand Up @@ -69,10 +70,10 @@ public static class CharHelper
private static readonly SearchValues<char> s_escapableSymbolChars = SearchValues.Create("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~•");

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsPunctuationException(char c) =>
c is '−' or '-' or '†' or '‡';
private static bool IsPunctuationException(Rune c) =>
c.IsBmp && (char)c.Value is '−' or '-' or '†' or '‡';

public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWord, out bool canOpen, out bool canClose)
public static void CheckOpenCloseDelimiter(Rune pc, Rune c, bool enableWithinWord, out bool canOpen, out bool canClose)
{
pc.CheckUnicodeCategory(out bool prevIsWhiteSpace, out bool prevIsPunctuation);
c.CheckUnicodeCategory(out bool nextIsWhiteSpace, out bool nextIsPunctuation);
Expand Down Expand Up @@ -100,13 +101,13 @@ public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWor
if (!enableWithinWord)
{
var temp = canOpen;
// A single _ character can open emphasis iff it is part of a left-flanking delimiter run and either
// (a) not part of a right-flanking delimiter run or
// A single _ character can open emphasis iff it is part of a left-flanking delimiter run and either
// (a) not part of a right-flanking delimiter run or
// (b) part of a right-flanking delimiter run preceded by punctuation.
canOpen = canOpen && (!canClose || prevIsPunctuation);

// A single _ character can close emphasis iff it is part of a right-flanking delimiter run and either
// (a) not part of a left-flanking delimiter run or
// (a) not part of a left-flanking delimiter run or
// (b) part of a left-flanking delimiter run followed by punctuation.
canClose = canClose && (!temp || nextIsPunctuation);
}
Expand Down Expand Up @@ -199,6 +200,9 @@ public static bool IsWhitespace(this char c)
return IsWhitespaceRare(c);
}

/// <summary>
/// Rune overload of the whitespace check: a rune outside the Basic Multilingual Plane is never
/// reported as whitespace, any other rune is forwarded to the <see cref="char"/> overload.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsWhitespace(this Rune r)
{
    if (!r.IsBmp)
    {
        return false;
    }

    return IsWhitespace((char)r.Value);
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please make these new helpers internal.

No two specs agree on what space means, so I'd avoid exposing more ambiguous extensions on common types (it's not obvious this is CommonMark-specific).

(same for all new helpers in this file)


[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsWhiteSpaceOrZero(this char c)
{
Expand Down Expand Up @@ -263,6 +267,52 @@ public static void CheckUnicodeCategory(this char c, out bool space, out bool pu
}
}

#if !(NETSTANDARD2_1_OR_GREATER || NETCOREAPP2_1_OR_GREATER)
private static Lazy<Func<int, UnicodeCategory>?> GetUnicodeCategoryReflection =
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should fall back to CharUnicodeInfo.GetUnicodeCategory(value.ToString(), 0) instead.
It's the same thing that .NET itself uses in its Rune impl for Framework.

https://github.com/dotnet/runtime/blob/d594a04c1a2426ef295448c756f55965dedfc044/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs#L1279-L1287

new(() => (Func<int, UnicodeCategory>?)typeof(char).GetMethod("GetUnicodeCategory", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static)?.CreateDelegate(
typeof(Func<int, UnicodeCategory>)));
#endif

/// <summary>
/// Classifies a rune as whitespace and/or punctuation.
/// </summary>
/// <param name="c">The rune to classify.</param>
/// <param name="space"><c>true</c> if the rune is whitespace or U+0000.</param>
/// <param name="punctuation">
/// <c>true</c> if the rune is ASCII punctuation (or U+0000), or, above U+007F, if its Unicode
/// category is included in <c>CommonMarkPunctuationCategoryMask</c>.
/// </param>
public static void CheckUnicodeCategory(this Rune c, out bool space, out bool punctuation)
{
    if (IsWhitespace(c))
    {
        space = true;
        punctuation = false;
    }
    else if (c.Value <= 127)
    {
        // An ASCII value always fits in a single char, so no IsBmp test is needed here.
        space = c.Value == 0;
        punctuation = IsAsciiPunctuationOrZero((char)c.Value);
    }
    else
    {
        space = false;
        punctuation = (CommonMarkPunctuationCategoryMask & (1 << (int)GetCategory(c))) != 0;
    }

    // Resolves the Unicode category of a rune on every supported target.
    static UnicodeCategory GetCategory(Rune rune)
    {
#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER
        return CharUnicodeInfo.GetUnicodeCategory(rune.Value);
#else
        if (rune.IsBmp)
        {
            return CharUnicodeInfo.GetUnicodeCategory((char)rune.Value);
        }

        // No int-based overload on these targets. The (string, index) overload reports the
        // category of the whole surrogate pair, so no private reflection is required.
        return CharUnicodeInfo.GetUnicodeCategory(rune.ToString(), 0);
#endif
    }
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static bool IsSpaceOrPunctuationForGFMAutoLink(char c)
{
Expand Down Expand Up @@ -309,15 +359,15 @@ public static bool IsZero(this char c)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsSpace(this char c)
{
// 2.1 Characters and lines
// 2.1 Characters and lines
// A space is U+0020.
return c == ' ';
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsTab(this char c)
{
// 2.1 Characters and lines
// 2.1 Characters and lines
// A tab is U+0009.
return c == '\t';
}
Expand Down
136 changes: 134 additions & 2 deletions src/Markdig/Helpers/StringSlice.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
// Copyright (c) Alexandre Mutel. All rights reserved.
// This file is licensed under the BSD-Clause 2 license.
// This file is licensed under the BSD-Clause 2 license.
// See the license.txt file in the project root for more information.

#nullable disable

using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;

namespace Markdig.Helpers;

Expand Down Expand Up @@ -114,7 +115,7 @@ internal StringSlice(string text, int start, int end, NewLine newLine, bool dumm
public NewLine NewLine;

/// <summary>
/// Gets the current character.
/// Gets the current character.
/// </summary>
public readonly char CurrentChar
{
Expand All @@ -125,6 +126,31 @@ public readonly char CurrentChar
}
}

/// <summary>
/// Gets the current rune (Unicode scalar value). Recognizes supplementary code points that cannot be covered by a single character.
/// </summary>
public readonly Rune CurrentRune
{
get
{
int start = Start;
if (start > End) return default;
var first = Text[start];
if (!char.IsSurrogate(first)) return new Rune(first);
if (char.IsHighSurrogate(first))
{
if (start + 1 > End) return default;
var second = Text[start + 1];
if (!char.IsLowSurrogate(second)) return default;
return new Rune(first, second);
}
if (start < 1) return default;
var trueFirst = Text[start - 1];
if (!char.IsHighSurrogate(trueFirst)) return default;
return new Rune(trueFirst, first);
Comment on lines +147 to +150
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a usage error, I would avoid reading backwards in these helpers.
(same for all of these)

}
}

/// <summary>
/// Gets a value indicating whether this instance is empty.
/// </summary>
Expand All @@ -145,6 +171,35 @@ public readonly char this[int index]
get => Text[index];
}

/// <summary>
/// Gets the Unicode scalar value (rune) that contains the UTF-16 code unit at the specified index.
/// Recognizes supplementary code points that cannot be covered by a single character.
/// </summary>
/// <param name="index">
/// The index into <see cref="Text"/> (same indexing as the character indexer; it is not offset by <see cref="Start"/>).
/// </param>
/// <returns>
/// The rune at the specified index. Returns the default value (refers to <c>'\0'</c>) if the code unit is a
/// surrogate whose counterpart is missing, is not a matching surrogate, or lies outside the
/// <see cref="Start"/>..<see cref="End"/> range.
/// </returns>
/// <exception cref="IndexOutOfRangeException"><paramref name="index"/> is outside <see cref="Text"/>.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Rune RuneAt(int index)
{
    var first = Text[index];
    if (!char.IsSurrogate(first))
    {
        return new Rune(first);
    }

    if (char.IsHighSurrogate(first))
    {
        // A high surrogate can only be completed by the low surrogate that follows it.
        // It must never be paired with the preceding char: combining two high surrogates
        // makes the Rune constructor throw.
        if (index + 1 <= End)
        {
            var second = Text[index + 1];
            if (char.IsLowSurrogate(second))
            {
                return new Rune(first, second);
            }
        }

        return default;
    }

    // Low surrogate: the index points at the second half of a pair, so look one char back.
    if (index >= Start + 1)
    {
        var trueFirst = Text[index - 1];
        if (char.IsHighSurrogate(trueFirst))
        {
            return new Rune(trueFirst, first);
        }
    }

    return default;
}


/// <summary>
/// Goes to the next character, incrementing the <see cref="Start" /> position.
Expand All @@ -166,6 +221,36 @@ public char NextChar()
return Text[start];
}

/// <summary>
/// Moves <see cref="Start"/> forward by one UTF-16 code unit and decodes the rune found there.
/// When that rune is a surrogate pair, <see cref="Start"/> is moved once more so that it rests on
/// the low surrogate of the pair.
/// </summary>
/// <returns>
/// The rune at the new position. Returns default when the slice is exhausted (in which case
/// <see cref="Start"/> is set to <c>End + 1</c>) or when the new position holds a surrogate that
/// does not form a complete pair within the slice.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public Rune NextRune()
{
    int position = Start;
    if (position >= End)
    {
        Start = End + 1;
        return default;
    }

    position++;
    Start = position;

    char lead = Text[position];
    if (!char.IsSurrogate(lead))
    {
        return new Rune(lead);
    }

    bool pairFitsInSlice = position + 1 <= End;
    if (char.IsHighSurrogate(lead) && pairFitsInSlice)
    {
        char trail = Text[position + 1];
        if (char.IsLowSurrogate(trail))
        {
            // Consume the second half of the pair as well.
            Start = position + 1;
            return new Rune(lead, trail);
        }
    }

    return default;
}

/// <summary>
/// Goes to the next character, incrementing the <see cref="Start" /> position.
/// </summary>
Expand Down Expand Up @@ -244,6 +329,53 @@ public readonly char PeekCharExtra(int offset)
return (uint)index < (uint)text.Length ? text[index] : '\0';
}

/// <summary>
/// Peeks a rune at the specified offset from the current beginning of the slice
/// without using the range <see cref="Start"/> or <see cref="End"/>, returns default if outside the <see cref="Text"/>.
/// Recognizes supplementary code points that cannot be covered by a single character.
/// </summary>
/// <param name="offset">The offset, in UTF-16 code units, added to <see cref="Start"/>.</param>
/// <returns>
/// The rune containing the code unit at the specified offset. If that code unit is a high surrogate
/// the following char completes the pair; if it is a low surrogate the preceding char does.
/// Returns default if the position is outside the text or the surrogate has no matching counterpart.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Rune PeekRuneExtra(int offset)
{
    var index = Start + offset;
    var text = Text;
    if ((uint)index >= (uint)text.Length)
    {
        return default;
    }

    var current = text[index];
    if (!char.IsSurrogate(current))
    {
        return new Rune(current);
    }

    if (char.IsHighSurrogate(current))
    {
        // First half of a pair: the low surrogate must follow immediately.
        if (index + 1 >= text.Length)
        {
            return default;
        }

        var lowSurrogate = text[index + 1];
        if (!char.IsLowSurrogate(lowSurrogate))
        {
            return default;
        }

        return new Rune(current, lowSurrogate);
    }

    // Second half of a pair (typical when peeking backwards with a negative offset):
    // the high surrogate must precede it. Only index 0 has no predecessor.
    if (index < 1)
    {
        return default;
    }

    var highSurrogate = text[index - 1];
    if (!char.IsHighSurrogate(highSurrogate))
    {
        return default;
    }

    return new Rune(highSurrogate, current);
}

/// <summary>
/// Matches the specified text.
/// </summary>
Expand Down
4 changes: 4 additions & 0 deletions src/Markdig/Markdig.targets
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@
<PackageReference Include="System.Memory" Version="4.6.0" />
</ItemGroup>

<ItemGroup Condition=" '$(TargetFramework)' == 'net462' OR '$(TargetFramework)' == 'netstandard2.0' OR '$(TargetFramework)' == 'netstandard2.1'">
<PackageReference Include="Shim.System.Text.Rune" Version="6.0.2" />
</ItemGroup>
Comment on lines +31 to +33
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove the new dependency, we should polyfill the required Rune logic instead
https://github.com/xoofx/markdig/tree/master/src/Markdig/Polyfills

The new Rune helpers on StringSlice can be internal (or at least internal on targets without Rune in the base libraries).


<ItemGroup>
<None Include="../../img/markdig.png" Pack="true" PackagePath="" />
<None Include="../../readme.md" Pack="true" PackagePath="/"/>
Expand Down
Loading
Loading