Skip to content

Commit 9ad5106

Browse files
authored
Handle odd ligatures names and fix #945 (#946)
1 parent 2080424 commit 9ad5106

File tree

4 files changed

+91
-21
lines changed

4 files changed

+91
-21
lines changed

src/UglyToad.PdfPig.Fonts/GlyphList.cs

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
public class GlyphList
1414
{
1515
/// <summary>
16-
/// <c>.notdef</c>.
16+
/// <c>.notdef</c> name.
1717
/// </summary>
1818
public const string NotDefined = ".notdef";
1919

@@ -37,7 +37,7 @@ public class GlyphList
3737
public static GlyphList AdditionalGlyphList => LazyAdditionalGlyphList.Value;
3838

3939
private static readonly Lazy<GlyphList> LazyZapfDingbatsGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("zapfdingbats"));
40-
40+
4141
/// <summary>
4242
/// Zapf Dingbats.
4343
/// </summary>
@@ -84,6 +84,7 @@ public string UnicodeCodePointToName(int unicodeValue)
8484

8585
/// <summary>
8686
/// Get the unicode value for the glyph name.
87+
/// See <see href="https://github.com/adobe-type-tools/agl-specification"/>.
8788
/// </summary>
8889
public string NameToUnicode(string name)
8990
{
@@ -103,25 +104,47 @@ public string NameToUnicode(string name)
103104
}
104105

105106
string unicode;
106-
// Remove suffixes
107+
// 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any.
107108
if (name.IndexOf('.') > 0)
108109
{
109110
unicode = NameToUnicode(name.Substring(0, name.IndexOf('.')));
110111
}
111-
else if (name.StartsWith("uni") && name.Length == 7)
112+
// 2. Split the remaining string into a sequence of components, using underscore (U+005F LOW LINE) as the delimiter.
113+
else if (name.IndexOf('_') > 0)
114+
{
115+
/*
116+
* MOZILLA-3136-0.pdf
117+
* 68-1990-01_A.pdf
118+
* TIKA-2054-0.pdf
119+
*/
120+
var sb = new StringBuilder();
121+
foreach (var s in name.Split('_'))
122+
{
123+
sb.Append(NameToUnicode(s));
124+
}
125+
126+
unicode = sb.ToString();
127+
}
128+
// Otherwise, if the component is of the form ‘uni’ (U+0075, U+006E, and U+0069) followed by a sequence of uppercase hexadecimal
129+
// digits (0–9 and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), if the length of that sequence is a multiple
130+
// of four, and if each group of four digits represents a value in the ranges 0000 through D7FF or E000 through FFFF, then
131+
// interpret each as a Unicode scalar value and map the component to the string made of those scalar values. Note that the range
132+
// and digit-length restrictions mean that the ‘uni’ glyph name prefix can be used only with UVs in the Basic Multilingual Plane (BMP).
133+
else if (name.StartsWith("uni") && (name.Length - 3) % 4 == 0)
112134
{
113135
// test for Unicode name in the format uniXXXX[XXXX...] where each group of four X is a hex-encoded UTF-16 code unit
114136
int nameLength = name.Length;
115137

116138
var uniStr = new StringBuilder();
117139

118-
var foundUnicode = true;
119140
for (int chPos = 3; chPos + 4 <= nameLength; chPos += 4)
120141
{
121-
if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var codePoint))
142+
if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4),
143+
NumberStyles.HexNumber,
144+
CultureInfo.InvariantCulture,
145+
out var codePoint))
122146
{
123-
foundUnicode = false;
124-
break;
147+
return null;
125148
}
126149

127150
if (codePoint > 0xD7FF && codePoint < 0xE000)
@@ -132,33 +155,30 @@ public string NameToUnicode(string name)
132155
uniStr.Append((char)codePoint);
133156
}
134157

135-
if (!foundUnicode)
136-
{
137-
return null;
138-
}
139-
140158
unicode = uniStr.ToString();
141159
}
142-
else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length == 5)
160+
// Otherwise, if the component is of the form ‘u’ (U+0075) followed by a sequence of four to six uppercase hexadecimal digits (0–9
161+
// and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), and those digits represent a value in the ranges 0000 through
162+
// D7FF or E000 through 10FFFF, then interpret it as a Unicode scalar value and map the component to the string made of this scalar value.
163+
else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length >= 5 && name.Length <= 7)
143164
{
144-
// test for an alternate Unicode name representation uXXXX
145165
var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
146166

147167
if (codePoint > 0xD7FF && codePoint < 0xE000)
148168
{
149-
throw new InvalidFontFormatException(
150-
$"Unicode character name with disallowed code area: {name}");
169+
throw new InvalidFontFormatException($"Unicode character name with disallowed code area: {name}");
151170
}
152171

153172
unicode = char.ConvertFromUtf32(codePoint);
154173
}
174+
// Ad-hoc special cases
155175
else if (name.StartsWith("c", StringComparison.OrdinalIgnoreCase) && name.Length >= 3 && name.Length <= 4)
156176
{
157177
// name representation cXX or cXXX, where the characters after 'c' are parsed as a decimal code point
158178
var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.Integer, CultureInfo.InvariantCulture);
159-
System.Diagnostics.Debug.Assert(codePoint > 0);
160179
unicode = char.ConvertFromUtf32(codePoint);
161180
}
181+
// Otherwise, the AGL specification maps the component to an empty string; this implementation returns null instead.
162182
else
163183
{
164184
return null;

src/UglyToad.PdfPig.Tests/Fonts/Encodings/GlyphListTests.cs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ public void NameToUnicodeRemovesSuffix()
5757
{
5858
var list = new GlyphList(new Dictionary<string, string>
5959
{
60-
{"Boris", "B"}
60+
{ "Boris", "B" }
6161
});
6262

6363
var result = list.NameToUnicode("Boris.Special");
@@ -70,7 +70,7 @@ public void NameToUnicodeConvertsHexAndUsesHexValue()
7070
{
7171
var list = new GlyphList(new Dictionary<string, string>
7272
{
73-
{"B", "X"}
73+
{ "B", "X" }
7474
});
7575

7676
var result = list.NameToUnicode("uni0042");
@@ -83,12 +83,27 @@ public void NameToUnicodeConvertsShortHexAndUsesHexValue()
8383
{
8484
var list = new GlyphList(new Dictionary<string, string>
8585
{
86-
{"E", "Æ"}
86+
{ "E", "Æ" }
8787
});
8888

8989
var result = list.NameToUnicode("u0045");
9090

9191
Assert.Equal("E", result);
9292
}
93+
94+
95+
/// <summary>
/// Checks the worked example from the Adobe Glyph List specification: the suffix after the
/// first period is dropped, the remainder is split on underscores, and each component
/// (a glyph list name, a <c>uni</c> name and a <c>u</c> name) is mapped and concatenated.
/// </summary>
[Fact]
public void NameToUnicodeConvertAglSpecification()
{
    // https://github.com/adobe-type-tools/agl-specification?tab=readme-ov-file#3-examples
    var list = new GlyphList(new Dictionary<string, string>
    {
        { "Lcommaaccent", "\u013B" }
    });

    var result = list.NameToUnicode("Lcommaaccent_uni20AC0308_u1040C.alternate");

    // U+1040C lies outside the Basic Multilingual Plane. The "\u" escape consumes exactly four
    // hex digits, so "\u1040C" would be U+1040 followed by the letter 'C'; the eight-digit
    // "\U" escape is required to express the intended scalar value (a surrogate pair in UTF-16).
    Assert.Equal("\u013B\u20AC\u0308\U0001040C", result);
}
93108
}
94109
}
3.43 MB
Binary file not shown.

src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,41 @@
66

77
public class GithubIssuesTests
88
{
9+
[Fact]
public void Issue945()
{
    // Odd ligature names: every listed page must contain at least one letter
    // whose value is the expected multi-character ligature expansion.
    void AssertPageHasLetter(PdfDocument pdf, int pageNumber, string expected)
    {
        var letterValues = pdf.GetPage(pageNumber).Letters.Select(letter => letter.Value);
        Assert.Contains(expected, letterValues);
    }

    var mozillaPath = IntegrationHelpers.GetDocumentPath("MOZILLA-3136-0.pdf");
    using (var mozilla = PdfDocument.Open(mozillaPath))
    {
        AssertPageHasLetter(mozilla, 2, "ff");
    }

    var reportPath = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf");
    using (var report = PdfDocument.Open(reportPath))
    {
        AssertPageHasLetter(report, 7, "fi");
    }

    var tikaPath = IntegrationHelpers.GetDocumentPath("TIKA-2054-0.pdf");
    using (var tika = PdfDocument.Open(tikaPath))
    {
        // Same document, several pages, checked in ascending page order.
        AssertPageHasLetter(tika, 3, "fi");
        AssertPageHasLetter(tika, 4, "ff");
        AssertPageHasLetter(tika, 6, "fl");
        AssertPageHasLetter(tika, 16, "ffi");
    }
}
43+
944
[Fact]
1045
public void Issue943()
1146
{

0 commit comments

Comments
 (0)