VYPR
Moderate severity · NVD Advisory · Published Jun 30, 2023 · Updated Nov 6, 2024

Infinite Loop when reading malformed objects in pypdf

CVE-2023-36807

Description

pypdf is a pure-Python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files. In version 2.10.5, an attacker who exploits this vulnerability can craft a PDF that leads to an infinite loop. This infinite loop blocks the current process and can drive a single CPU core to 100% utilization. It does not affect memory usage. This happens, for example, when the user extracts metadata from such a malformed PDF. Versions prior to 2.10.5 throw an error, but do not hang forever. This issue was fixed in https://github.com/py-pdf/pypdf/pull/1331, which is included in release 2.10.6. Users are advised to upgrade. Users unable to upgrade should modify PyPDF2/generic/_data_structures.py::read_object to include an error-throwing case. See GHSA-hm9v-vj3r-r55m for details.

Affected packages

Versions sourced from the GitHub Security Advisory.

Package | Affected versions | Patched versions
PyPDF2 (PyPI) | >= 2.10.5, < 2.10.6 | 2.10.6

Affected products

1

Patches

1
e6531a25325e

ROB: Fix infinite loop due to Invalid object (#1331)

https://github.com/py-pdf/pypdf · pubpub-zz · Sep 9, 2022 · via ghsa
5 files changed · +58 -17
  • PyPDF2/_cmap.py · +2 -1 · modified
    @@ -5,7 +5,7 @@
     from ._codecs import adobe_glyphs, charset_encoding
     from ._utils import logger_warning
     from .errors import PdfReadWarning
    -from .generic import DecodedStreamObject, DictionaryObject
    +from .generic import DecodedStreamObject, DictionaryObject, NameObject
     
     
     # code freely inspired from @twiggy ; see #711
    @@ -124,6 +124,7 @@ def parse_encoding(
         enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object()  # type: ignore
         if isinstance(enc, str):
             try:
    +            enc = NameObject.unnumber(enc)  # for #xx decoding
                 if enc in charset_encoding:
                     encoding = charset_encoding[enc].copy()
                 elif enc in _predefined_cmap:
    
  • PyPDF2/generic/_base.py · +11 -2 · modified
    @@ -420,6 +420,14 @@ def writeToStream(
             deprecate_with_replacement("writeToStream", "write_to_stream")
             self.write_to_stream(stream, encryption_key)
     
    +    @staticmethod
    +    def unnumber(sin: str) -> str:
    +        i = sin.find("#")
    +        while i >= 0:
    +            sin = sin[:i] + chr(int(sin[i + 1 : i + 3], 16)) + sin[i + 3 :]
    +            i = sin.find("#")
    +        return sin
    +
         @staticmethod
         def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
             name = stream.read(1)
    @@ -431,10 +439,11 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject":  # PdfReader
                     ret = name.decode("utf-8")
                 except (UnicodeEncodeError, UnicodeDecodeError):
                     ret = name.decode("gbk")
    -            return NameObject(ret)
    -        except (UnicodeEncodeError, UnicodeDecodeError) as e:
                 # Name objects should represent irregular characters
                 # with a '#' followed by the symbol's hex number
    +            ret = NameObject.unnumber(ret)
    +            return NameObject(ret)
    +        except (UnicodeEncodeError, UnicodeDecodeError) as e:
                 if not pdf.strict:
                     logger_warning("Illegal character in Name Object", __name__)
                     return NameObject(name)
    
  • PyPDF2/generic/_data_structures.py · +25 -14 · modified
    @@ -67,7 +67,6 @@
     from ._utils import read_hex_string_from_stream, read_string_from_stream
     
     logger = logging.getLogger(__name__)
    -ObjectPrefix = b"/<[tf(n%"
     NumberSigns = b"+-"
     IndirectPattern = re.compile(rb"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")
     
    @@ -263,10 +262,19 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes:  # PdfReader
                     stream.read(1)
                     break
                 stream.seek(-1, 1)
    -            key = read_object(stream, pdf)
    -            tok = read_non_whitespace(stream)
    -            stream.seek(-1, 1)
    -            value = read_object(stream, pdf, forced_encoding)
    +            try:
    +                key = read_object(stream, pdf)
    +                tok = read_non_whitespace(stream)
    +                stream.seek(-1, 1)
    +                value = read_object(stream, pdf, forced_encoding)
    +            except Exception as exc:
    +                if pdf is not None and pdf.strict:
    +                    raise PdfReadError(exc.__repr__())
    +                logger_warning(exc.__repr__(), __name__)
    +                retval = DictionaryObject()
    +                retval.update(data)
    +                return retval  # return partial data
    +
                 if not data.get(key):
                     data[key] = value
                 else:
    @@ -812,10 +820,9 @@ def read_object(
     ) -> Union[PdfObject, int, str, ContentStream]:
         tok = stream.read(1)
         stream.seek(-1, 1)  # reset to start
    -    idx = ObjectPrefix.find(tok)
    -    if idx == 0:
    +    if tok == b"/":
             return NameObject.read_from_stream(stream, pdf)
    -    elif idx == 1:
    +    elif tok == b"<":
             # hexadecimal string OR dictionary
             peek = stream.read(2)
             stream.seek(-2, 1)  # reset to start
    @@ -824,15 +831,15 @@ def read_object(
                 return DictionaryObject.read_from_stream(stream, pdf, forced_encoding)
             else:
                 return read_hex_string_from_stream(stream, forced_encoding)
    -    elif idx == 2:
    +    elif tok == b"[":
             return ArrayObject.read_from_stream(stream, pdf, forced_encoding)
    -    elif idx == 3 or idx == 4:
    +    elif tok == b"t" or tok == b"f":
             return BooleanObject.read_from_stream(stream)
    -    elif idx == 5:
    +    elif tok == b"(":
             return read_string_from_stream(stream, forced_encoding)
    -    elif idx == 6:
    +    elif tok == b"n":
             return NullObject.read_from_stream(stream)
    -    elif idx == 7:
    +    elif tok == b"%":
             # comment
             while tok not in (b"\r", b"\n"):
                 tok = stream.read(1)
    @@ -843,14 +850,18 @@ def read_object(
             tok = read_non_whitespace(stream)
             stream.seek(-1, 1)
             return read_object(stream, pdf, forced_encoding)
    -    else:
    +    elif tok in b"0123456789+-.":
             # number object OR indirect reference
             peek = stream.read(20)
             stream.seek(-len(peek), 1)  # reset to start
             if IndirectPattern.match(peek) is not None:
                 return IndirectObject.read_from_stream(stream, pdf)
             else:
                 return NumberObject.read_from_stream(stream)
    +    else:
    +        raise PdfReadError(
    +            f"Invalid Elementary Object starting with {tok} @{stream.tell()}"  # type: ignore
    +        )
     
     
     class Field(TreeObject):
    
  • PyPDF2/_reader.py · +9 -0 · modified
    @@ -1139,6 +1139,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
                         buf = bytes(self.stream.getbuffer())  # type: ignore
                     else:
                         p = self.stream.tell()
    +                    self.stream.seek(0, 0)
                         buf = self.stream.read(-1)
                         self.stream.seek(p, 0)
                     m = re.search(
    @@ -1192,6 +1193,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
                     buf = bytes(self.stream.getbuffer())  # type: ignore
                 else:
                     p = self.stream.tell()
    +                self.stream.seek(0, 0)
                     buf = self.stream.read(-1)
                     self.stream.seek(p, 0)
                 m = re.search(
    @@ -1883,6 +1885,13 @@ def xfa(self) -> Optional[Dict[str, Any]]:
                             retval[tag] = es
             return retval
     
    +    def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    +        """
    +        used to ease development
    +        equivalent to generic.IndirectObject(num,gen,self).get_object()
    +        """
    +        return IndirectObject(num, gen, self).get_object()
    +
     
     class PdfFileReader(PdfReader):  # pragma: no cover
         def __init__(self, *args: Any, **kwargs: Any) -> None:
    
  • tests/test_generic.py · +11 -0 · modified
    @@ -175,6 +175,17 @@ def test_NameObject():
         with pytest.raises(PdfReadError) as exc:
             NameObject.read_from_stream(stream, None)
         assert exc.value.args[0] == "name read error"
    +    assert (
    +        NameObject.read_from_stream(
    +            BytesIO(b"/A;Name_With-Various***Characters?"), None
    +        )
    +        == "/A;Name_With-Various***Characters?"
    +    )
    +    assert (
    +        NameObject.read_from_stream(BytesIO(b"/paired#28#29parentheses"), None)
    +        == "/paired()parentheses"
    +    )
    +    assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB"
     
     
     def test_destination_fit_r():
    

Vulnerability mechanics

Generated by null/stub on May 9, 2026. Inputs: CWE entries + fix-commit diffs from this CVE's patches. Citations validated against bundle.

References

6

News mentions

0

No linked articles in our index yet.