SEC: Avoid infinite loop when reading broken DCT-based inline images (#3501)

stefan6419846 · web-flow · commit f2864d6dd9ba · 2025-10-22T17:52:40.000+02:00
diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py
@@ -148,18 +148,26 @@ def extract_inline_DCT(stream: StreamType) -> bytes:
     Extract DCT (JPEG) stream from inline image.
     The stream will be moved onto the EI.
     """
+    def read(length: int) -> bytes:
+        # Fewer bytes than *length* (0 bytes for a non-zero *length* means end of file) is a truncated stream.
+        # If the stream is in non-blocking mode and no bytes are available, `None` is returned.
+        _result = stream.read(length)
+        if _result is None or len(_result) != length:
+            raise PdfReadError("Unexpected end of stream")
+        return _result
+
     data_out: bytes = b""
     # Read Blocks of data (ID/Size/data) up to ID=FF/D9
     # https://www.digicamsoft.com/itu/itu-t81-36.html
-    notfirst = False
+    not_first = False
     while True:
-        c = stream.read(1)
-        if notfirst or (c == b"\xff"):
+        c = read(1)
+        if not_first or (c == b"\xff"):
             data_out += c
         if c != b"\xff":
             continue
-        notfirst = True
-        c = stream.read(1)
+        not_first = True
+        c = read(1)
         data_out += c
         if c == b"\xff":
             stream.seek(-1, 1)  # pragma: no cover
@@ -172,10 +180,10 @@ def extract_inline_DCT(stream: StreamType) -> bytes:
             b"\xda\xdb\xdc\xdd\xde\xdf"
             b"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xfe"
         ):
-            c = stream.read(2)
+            c = read(2)
             data_out += c
             sz = c[0] * 256 + c[1]
-            data_out += stream.read(sz - 2)
+            data_out += read(sz - 2)
 
     ei_tok = read_non_whitespace(stream)
     ei_tok += stream.read(2)
diff --git a/tests/generic/test_image_inline.py b/tests/generic/test_image_inline.py
@@ -1,7 +1,12 @@
 """Test the pypdf.generic._image_inline module."""
 from io import BytesIO
 
+import pytest
+
+from pypdf import PdfReader
+from pypdf.errors import PdfReadError
 from pypdf.generic._image_inline import is_followed_by_binary_data
+from tests import get_data_from_url
 
 
 def test_is_followed_by_binary_data():
@@ -59,3 +64,14 @@ def test_is_followed_by_binary_data():
 
     stream = BytesIO(b"1234.56 42 13 37 10 20 c\n")
     assert not is_followed_by_binary_data(stream)
+
+
+@pytest.mark.enable_socket
+def test_extract_inline_dct__early_end_of_file():
+    url = "https://github.com/user-attachments/files/23056988/inline_dct__early_eof.pdf"
+    name = "inline_dct__early_eof.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+
+    with pytest.raises(expected_exception=PdfReadError, match=r"^Unexpected end of stream$"):
+        page.images[0].image.load()