diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/cmap-parsing-exception.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/cmap-parsing-exception.pdf new file mode 100644 index 000000000..3ff520c8a Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/cmap-parsing-exception.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/reference-2-numeric-error.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/reference-2-numeric-error.pdf new file mode 100644 index 000000000..8fbac3afe Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/reference-2-numeric-error.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs index 1f7e35482..a846ccfcd 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs @@ -4,6 +4,36 @@ public class GithubIssuesTests { + [Fact] + public void Issue693() + { + var doc = IntegrationHelpers.GetDocumentPath("reference-2-numeric-error.pdf"); + + using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = true })) + { + var page1 = document.GetPage(1); + Assert.Equal(1269, page1.Letters.Count); + } + } + + [Fact] + public void Issue692() + { + var doc = IntegrationHelpers.GetDocumentPath("cmap-parsing-exception.pdf"); + + using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = true, SkipMissingFonts = true })) + { + var page1 = document.GetPage(1); + Assert.Equal(796, page1.Letters.Count); + } + + using (var document = PdfDocument.Open(doc, new ParsingOptions() { UseLenientParsing = false, SkipMissingFonts = false })) + { + var ex = Assert.Throws(() => document.GetPage(1)); + Assert.StartsWith("Read byte called on input bytes which was at end of byte set.", ex.Message); + } + } + [Fact] public void Issue874() { diff --git a/src/UglyToad.PdfPig.Tests/Integration/IntegrationDocumentTests.cs b/src/UglyToad.PdfPig.Tests/Integration/IntegrationDocumentTests.cs index 7ffa391ba..ac2950698 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/IntegrationDocumentTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/IntegrationDocumentTests.cs @@ -7,7 +7,8 @@ public class IntegrationDocumentTests [ "issue_671.pdf", "GHOSTSCRIPT-698363-0.pdf", - "ErcotFacts.pdf" + "ErcotFacts.pdf", + "cmap-parsing-exception.pdf" ]; [Theory] diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index 8cf852927..14d06b336 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -166,7 +166,7 @@ private static PdfDocument OpenDocument( cidFontFactory, filterProvider, pdfScanner, - parsingOptions.Logger); + parsingOptions); var type1Handler = new Type1FontHandler(pdfScanner, filterProvider, encodingReader); diff --git a/src/UglyToad.PdfPig/PdfExtensions.cs b/src/UglyToad.PdfPig/PdfExtensions.cs index 303c39907..68d9012e6 100644 --- a/src/UglyToad.PdfPig/PdfExtensions.cs +++ b/src/UglyToad.PdfPig/PdfExtensions.cs @@ -1,11 +1,10 @@ namespace UglyToad.PdfPig { - using System; - using System.Collections.Generic; + using System; using System.Diagnostics.CodeAnalysis; using Core; using Filters; - using Parser.Parts; + using Parser.Parts; using Tokenization.Scanner; using Tokens; diff --git a/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs b/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs index ba631180f..ab551dc19 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs @@ -10,7 +10,7 @@ /// The CMap (character code map) maps character codes to character identifiers (CIDs). /// The set of characters which a CMap refers to is the "character set" (charset). /// - internal class CMap + internal sealed class CMap { public CharacterIdentifierSystemInfo Info { get; } @@ -140,13 +140,12 @@ public int ConvertToCid(int code) return 0; } - public override string ToString() { return Name; } - public int ReadCode(IInputBytes bytes) + public int ReadCode(IInputBytes bytes, bool useLenientParsing) { if (hasEmptyCodespace) { @@ -166,7 +165,7 @@ public int ReadCode(IInputBytes bytes) break; } - result[i] = ReadByte(bytes); + result[i] = ReadByte(bytes, useLenientParsing); } for (int i = minCodeLength - 1; i < maxCodeLength; i++) @@ -181,17 +180,23 @@ public int ReadCode(IInputBytes bytes) } if (byteCount < maxCodeLength) { - result[byteCount] = ReadByte(bytes); + result[byteCount] = ReadByte(bytes, useLenientParsing); } } throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}."); } - private static byte ReadByte(IInputBytes bytes) + private static byte ReadByte(IInputBytes bytes, bool useLenientParsing) { if (!bytes.MoveNext()) { + if (useLenientParsing) + { + // See issue #692 + return 0; + } + throw new InvalidOperationException("Read byte called on input bytes which was at end of byte set. Current offset: " + bytes.CurrentOffset); } @@ -208,6 +213,5 @@ private static int ByteArrayToInt(ReadOnlySpan data) } return code; } - } } diff --git a/src/UglyToad.PdfPig/PdfFonts/Composite/ToUnicodeCMap.cs b/src/UglyToad.PdfPig/PdfFonts/Composite/ToUnicodeCMap.cs index c114b0734..51a2b07f8 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Composite/ToUnicodeCMap.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Composite/ToUnicodeCMap.cs @@ -9,7 +9,7 @@ /// Defines the information content (actual text) of the font /// as opposed to the display format. /// - internal class ToUnicodeCMap + internal sealed class ToUnicodeCMap { private readonly CMap? cMap; @@ -45,9 +45,9 @@ public bool TryGet(int code, [NotNullWhen(true)] out string? value) return cMap.TryConvertToUnicode(code, out value); } - public int ReadCode(IInputBytes inputBytes) + public int ReadCode(IInputBytes inputBytes, bool useLenientParsing) { - return cMap!.ReadCode(inputBytes); + return cMap!.ReadCode(inputBytes, useLenientParsing); } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs b/src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs index cf3fd5079..a141a9dbc 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs @@ -21,6 +21,8 @@ internal sealed class Type0Font : IFont, IVerticalWritingSupported private readonly Dictionary boundingBoxCache = new Dictionary(); + private readonly bool useLenientParsing; + public NameToken Name => BaseFont; public NameToken BaseFont { get; } @@ -41,6 +43,7 @@ public Type0Font( CMap cmap, CMap? toUnicodeCMap, CMap? ucs2CMap, + ParsingOptions parsingOptions, bool isChineseJapaneseOrKorean) { this.ucs2CMap = ucs2CMap; @@ -52,13 +55,15 @@ public Type0Font( ToUnicode = new ToUnicodeCMap(toUnicodeCMap); Details = cidFont.Details?.WithName(Name.Data) ?? FontDetails.GetDefault(Name.Data); + + useLenientParsing = parsingOptions.UseLenientParsing; } public int ReadCharacterCode(IInputBytes bytes, out int codeLength) { var current = bytes.CurrentOffset; - var code = CMap.ReadCode(bytes); + var code = CMap.ReadCode(bytes, useLenientParsing); codeLength = (int)(bytes.CurrentOffset - current); diff --git a/src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type0FontHandler.cs b/src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type0FontHandler.cs index 14b75c933..ee74415e2 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type0FontHandler.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type0FontHandler.cs @@ -15,23 +15,25 @@ using Tokens; using Util; - internal class Type0FontHandler : IFontHandler + internal sealed class Type0FontHandler : IFontHandler { private readonly CidFontFactory cidFontFactory; private readonly ILookupFilterProvider filterProvider; private readonly IPdfTokenScanner scanner; private readonly ILog logger; + private readonly ParsingOptions parsingOptions; public Type0FontHandler( CidFontFactory cidFontFactory, ILookupFilterProvider filterProvider, IPdfTokenScanner scanner, - ILog logger) + ParsingOptions parsingOptions) { this.cidFontFactory = cidFontFactory; this.filterProvider = filterProvider; this.scanner = scanner; - this.logger = logger; + logger = parsingOptions.Logger; + this.parsingOptions = parsingOptions; } public IFont Generate(DictionaryToken dictionary) @@ -91,7 +93,7 @@ public IFont Generate(DictionaryToken dictionary) } } - var font = new Type0Font(baseFont!, cidFont, cMap, toUnicodeCMap, ucs2CMap, isChineseJapaneseOrKorean); + var font = new Type0Font(baseFont!, cidFont, cMap, toUnicodeCMap, ucs2CMap, parsingOptions, isChineseJapaneseOrKorean); return font; } diff --git a/src/UglyToad.PdfPig/PdfFonts/Parser/Parts/CidFontFactory.cs b/src/UglyToad.PdfPig/PdfFonts/Parser/Parts/CidFontFactory.cs index dfc0167ef..9a9788b60 100644 --- a/src/UglyToad.PdfPig/PdfFonts/Parser/Parts/CidFontFactory.cs +++ b/src/UglyToad.PdfPig/PdfFonts/Parser/Parts/CidFontFactory.cs @@ -17,7 +17,7 @@ using UglyToad.PdfPig.Logging; using Util; - internal class CidFontFactory + internal sealed class CidFontFactory { private readonly ILookupFilterProvider filterProvider; private readonly IPdfTokenScanner pdfScanner; @@ -46,7 +46,7 @@ public CidFontFactory(ILog log, IPdfTokenScanner pdfScanner, ILookupFilterProvid defaultWidth = defaultWidthToken.Double; } - var verticalWritingMetrics = ReadVerticalDisplacements(dictionary); + var verticalWritingMetrics = ReadVerticalDisplacements(dictionary, pdfScanner); FontDescriptor? descriptor = null; if (TryGetFontDescriptor(dictionary, out var descriptorDictionary)) @@ -190,7 +190,7 @@ private IReadOnlyDictionary ReadWidths(DictionaryToken dict) return widths; } - private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken dict) + private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken dict, IPdfTokenScanner pdfScanner) { var verticalDisplacements = new Dictionary(); var positionVectors = new Dictionary(); @@ -210,22 +210,21 @@ private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken } // vertical metrics for individual CIDs. - if (dict.TryGet(NameToken.W2, out var w2Token) && w2Token is ArrayToken w2) + if (dict.TryGet(NameToken.W2, pdfScanner, out ArrayToken? w2)) { for (var i = 0; i < w2.Data.Count; i++) { - var c = (NumericToken)w2.Data[i]; + var c = DirectObjectFinder.Get(w2.Data[i], pdfScanner); var next = w2.Data[++i]; - - if (next is ArrayToken array) + if (DirectObjectFinder.TryGet(next, pdfScanner, out ArrayToken? array)) { for (var j = 0; j < array.Data.Count; j++) { var cid = c.Int + j; // ReSharper disable InconsistentNaming - var w1y = (NumericToken)array.Data[j]; - var v1x = (NumericToken)array.Data[++j]; - var v1y = (NumericToken)array.Data[++j]; + var w1y = DirectObjectFinder.Get(array.Data[j], pdfScanner); + var v1x = DirectObjectFinder.Get(array.Data[++j], pdfScanner); + var v1y = DirectObjectFinder.Get(array.Data[++j], pdfScanner); verticalDisplacements[cid] = w1y.Double; @@ -236,9 +235,9 @@ private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken { var first = c.Int; var last = ((NumericToken)next).Int; - var w1y = (NumericToken)w2.Data[++i]; - var v1x = (NumericToken)w2.Data[++i]; - var v1y = (NumericToken)w2.Data[++i]; + var w1y = DirectObjectFinder.Get(w2.Data[++i], pdfScanner); + var v1x = DirectObjectFinder.Get(w2.Data[++i], pdfScanner); + var v1y = DirectObjectFinder.Get(w2.Data[++i], pdfScanner); // ReSharper restore InconsistentNaming for (var cid = first; cid <= last; cid++) @@ -250,7 +249,7 @@ private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken } } } - + return new VerticalWritingMetrics(dw2, verticalDisplacements, positionVectors); }