Handle odd ligatures names and fix #945 (#946)

BobLd · web-flow · commit 9ad51067b067 · 2024-11-27T19:44:17.000Z
diff --git a/src/UglyToad.PdfPig.Fonts/GlyphList.cs b/src/UglyToad.PdfPig.Fonts/GlyphList.cs
@@ -13,7 +13,7 @@
     public class GlyphList
     {
         /// <summary>
-        /// <c>.notdef</c>.
+        /// <c>.notdef</c> name.
         /// </summary>
         public const string NotDefined = ".notdef";
 
@@ -37,7 +37,7 @@ public class GlyphList
         public static GlyphList AdditionalGlyphList => LazyAdditionalGlyphList.Value;
 
         private static readonly Lazy<GlyphList> LazyZapfDingbatsGlyphList = new Lazy<GlyphList>(() => GlyphListFactory.Get("zapfdingbats"));
-
+        
         /// <summary>
         /// Zapf Dingbats.
         /// </summary>
@@ -84,6 +84,7 @@ public string UnicodeCodePointToName(int unicodeValue)
 
         /// <summary>
         /// Get the unicode value for the glyph name.
+        /// See <see href="https://github.com/adobe-type-tools/agl-specification"/>.
         /// </summary>
         public string NameToUnicode(string name)
         {
@@ -103,25 +104,47 @@ public string NameToUnicode(string name)
             }
 
             string unicode;
-            // Remove suffixes
+            // 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any.
             if (name.IndexOf('.') > 0)
             {
                 unicode = NameToUnicode(name.Substring(0, name.IndexOf('.')));
             }
-            else if (name.StartsWith("uni") && name.Length == 7)
+            // 2. Split the remaining string into a sequence of components, using underscore (U+005F LOW LINE) as the delimiter.
+            else if (name.IndexOf('_') > 0)
+            {
+                /*
+                 * MOZILLA-3136-0.pdf
+                 * 68-1990-01_A.pdf
+                 * TIKA-2054-0.pdf
+                 */
+                var sb = new StringBuilder();
+                foreach (var s in name.Split('_'))
+                {
+                    sb.Append(NameToUnicode(s));
+                }
+
+                unicode = sb.ToString();
+            }
+            // Otherwise, if the component is of the form ‘uni’ (U+0075, U+006E, and U+0069) followed by a sequence of uppercase hexadecimal
+            // digits (0–9 and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), if the length of that sequence is a multiple
+            // of four, and if each group of four digits represents a value in the ranges 0000 through D7FF or E000 through FFFF, then
+            // interpret each as a Unicode scalar value and map the component to the string made of those scalar values. Note that the range
+            // and digit-length restrictions mean that the ‘uni’ glyph name prefix can be used only with UVs in the Basic Multilingual Plane (BMP).
+            else if (name.StartsWith("uni") && (name.Length - 3) % 4 == 0)
             {
                 // test for Unicode name in the format uniXXXX where X is hex
                 int nameLength = name.Length;
 
                 var uniStr = new StringBuilder();
 
-                var foundUnicode = true;
                 for (int chPos = 3; chPos + 4 <= nameLength; chPos += 4)
                 {
-                    if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var codePoint))
+                    if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4),
+                            NumberStyles.HexNumber,
+                            CultureInfo.InvariantCulture,
+                            out var codePoint))
                     {
-                        foundUnicode = false;
-                        break;
+                        return null;
                     }
 
                     if (codePoint > 0xD7FF && codePoint < 0xE000)
@@ -132,33 +155,30 @@ public string NameToUnicode(string name)
                     uniStr.Append((char)codePoint);
                 }
 
-                if (!foundUnicode)
-                {
-                    return null;
-                }
-
                 unicode = uniStr.ToString();
             }
-            else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length == 5)
+            // Otherwise, if the component is of the form ‘u’ (U+0075) followed by a sequence of four to six uppercase hexadecimal digits (0–9
+            // and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), and those digits represents a value in the ranges 0000 through
+            // D7FF or E000 through 10FFFF, then interpret it as a Unicode scalar value and map the component to the string made of this scalar value.
+            else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length >= 5 && name.Length <= 7)
             {
-                // test for an alternate Unicode name representation uXXXX
                 var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
 
                 if (codePoint > 0xD7FF && codePoint < 0xE000)
                 {
-                    throw new InvalidFontFormatException(
-                        $"Unicode character name with disallowed code area: {name}");
+                    throw new InvalidFontFormatException($"Unicode character name with disallowed code area: {name}");
                 }
 
                 unicode = char.ConvertFromUtf32(codePoint);
             }
+            // Ad-hoc special cases
             else if (name.StartsWith("c", StringComparison.OrdinalIgnoreCase) && name.Length >= 3 && name.Length <= 4)
             {
                 // name representation cXXX
                 var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.Integer, CultureInfo.InvariantCulture);
-                System.Diagnostics.Debug.Assert(codePoint > 0);
                 unicode = char.ConvertFromUtf32(codePoint);
             }
+            // Otherwise, the AGL specification maps the component to an empty string; this implementation returns null instead.
             else
             {
                 return null;
diff --git a/src/UglyToad.PdfPig.Tests/Fonts/Encodings/GlyphListTests.cs b/src/UglyToad.PdfPig.Tests/Fonts/Encodings/GlyphListTests.cs
@@ -57,7 +57,7 @@ public void NameToUnicodeRemovesSuffix()
         {
             var list = new GlyphList(new Dictionary<string, string>
             {
-                {"Boris", "B"}
+                { "Boris", "B" }
             });
 
             var result = list.NameToUnicode("Boris.Special");
@@ -70,7 +70,7 @@ public void NameToUnicodeConvertsHexAndUsesHexValue()
         {
             var list = new GlyphList(new Dictionary<string, string>
             {
-                {"B", "X"}
+                { "B", "X" }
             });
 
             var result = list.NameToUnicode("uni0042");
@@ -83,12 +83,27 @@ public void NameToUnicodeConvertsShortHexAndUsesHexValue()
         {
             var list = new GlyphList(new Dictionary<string, string>
             {
-                {"E", "Æ"}
+                { "E", "Æ" }
             });
 
             var result = list.NameToUnicode("u0045");
 
             Assert.Equal("E", result);
         }
+
+
+        [Fact]
+        public void NameToUnicodeConvertAglSpecification()
+        {
+            // https://github.com/adobe-type-tools/agl-specification?tab=readme-ov-file#3-examples
+            var list = new GlyphList(new Dictionary<string, string>
+            {
+                { "Lcommaaccent", "\u013B" }
+            });
+
+            var result = list.NameToUnicode("Lcommaaccent_uni20AC0308_u1040C.alternate");
+
+            Assert.Equal("\u013B\u20AC\u0308\U0001040C", result); // U+1040C is outside the BMP: it needs the 8-digit \U escape ("\u1040C" is U+1040 followed by 'C').
+        }
     }
 }
diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-2054-0.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-2054-0.pdf
diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
@@ -6,6 +6,41 @@
 
     public class GithubIssuesTests
     {
+        [Fact]
+        public void Issue945()
+        {
+            // Odd ligature names: each page checked below must yield the expected ligature text ("ff", "fi", "fl", "ffi") as a letter value.
+            var path = IntegrationHelpers.GetDocumentPath("MOZILLA-3136-0.pdf");
+            using (var document = PdfDocument.Open(path))
+            {
+                var page = document.GetPage(2);
+                Assert.Contains("ff", page.Letters.Select(l => l.Value));
+            }
+
+            path = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf");
+            using (var document = PdfDocument.Open(path))
+            {
+                var page = document.GetPage(7);
+                Assert.Contains("fi", page.Letters.Select(l => l.Value));
+            }
+
+            path = IntegrationHelpers.GetDocumentPath("TIKA-2054-0.pdf");
+            using (var document = PdfDocument.Open(path))
+            {
+                var page = document.GetPage(3);
+                Assert.Contains("fi", page.Letters.Select(l => l.Value));
+
+                page = document.GetPage(4);
+                Assert.Contains("ff", page.Letters.Select(l => l.Value));
+
+                page = document.GetPage(6);
+                Assert.Contains("fl", page.Letters.Select(l => l.Value));
+
+                page = document.GetPage(16);
+                Assert.Contains("ffi", page.Letters.Select(l => l.Value));
+            }
+        }
+
         [Fact]
         public void Issue943()
         {

Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ public void NameToUnicodeRemovesSuffix()`
`57`	`57`	`{`
`58`	`58`	`var list = new GlyphList(new Dictionary<string, string>`
`59`	`59`	`{`
`60`		`- {"Boris", "B"}`
	`60`	`+ { "Boris", "B" }`
`61`	`61`	`});`
`62`	`62`
`63`	`63`	`var result = list.NameToUnicode("Boris.Special");`
`@@ -70,7 +70,7 @@ public void NameToUnicodeConvertsHexAndUsesHexValue()`
`70`	`70`	`{`
`71`	`71`	`var list = new GlyphList(new Dictionary<string, string>`
`72`	`72`	`{`
`73`		`- {"B", "X"}`
	`73`	`+ { "B", "X" }`
`74`	`74`	`});`
`75`	`75`
`76`	`76`	`var result = list.NameToUnicode("uni0042");`
`@@ -83,12 +83,27 @@ public void NameToUnicodeConvertsShortHexAndUsesHexValue()`
`83`	`83`	`{`
`84`	`84`	`var list = new GlyphList(new Dictionary<string, string>`
`85`	`85`	`{`
`86`		`- {"E", "Æ"}`
	`86`	`+ { "E", "Æ" }`
`87`	`87`	`});`
`88`	`88`
`89`	`89`	`var result = list.NameToUnicode("u0045");`
`90`	`90`
`91`	`91`	`Assert.Equal("E", result);`
`92`	`92`	`}`
	`93`	`+`
	`94`	`+`
	`95`	`+ [Fact]`
	`96`	`+ public void NameToUnicodeConvertAglSpecification()`
	`97`	`+ {`
	`98`	`+ // https://github.com/adobe-type-tools/agl-specification?tab=readme-ov-file#3-examples`
	`99`	`+ var list = new GlyphList(new Dictionary<string, string>`
	`100`	`+ {`
	`101`	`+ { "Lcommaaccent", "\u013B" }`
	`102`	`+ });`
	`103`	`+`
	`104`	`+ var result = list.NameToUnicode("Lcommaaccent_uni20AC0308_u1040C.alternate");`
	`105`	`+`
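	`106`	`+ Assert.Equal("\u013B\u20AC\u0308\U0001040C", result); // U+1040C is outside the BMP: it needs the 8-digit \U escape ("\u1040C" is U+1040 followed by 'C').`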
	`107`	`+ }`
`93`	`108`	`}`
`94`	`109`	`}`