Infinite Loop when reading malformed objects in pypdf
Description
pypdf is a pure-Python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files. In version 2.10.5, an attacker can exploit this vulnerability by crafting a PDF that leads to an infinite loop. This infinite loop blocks the current process and can drive a single CPU core to 100% utilization. It does not affect memory usage. This is the case, for example, when the user extracts metadata from such a malformed PDF. Versions prior to 2.10.5 throw an error, but do not hang forever. This issue was fixed in https://github.com/py-pdf/pypdf/pull/1331, which has been included in release 2.10.6. Users are advised to upgrade. Users unable to upgrade should modify PyPDF2/generic/_data_structures.py::read_object so that it raises an error when an object starts with an unrecognized character, instead of falling through to the number/indirect-reference parsing branch. See GHSA-hm9v-vj3r-r55m for details.
Affected packages
Versions sourced from the GitHub Security Advisory.
| Package | Affected versions | Patched versions |
|---|---|---|
| PyPDF2 (PyPI) | >= 2.10.5, < 2.10.6 | 2.10.6 |
Affected products (1)
Patches (1)
e6531a25325e — ROB: Fix infinite loop due to Invalid object (#1331)
5 files changed · +58 −17
PyPDF2/_cmap.py+2 −1 modified@@ -5,7 +5,7 @@ from ._codecs import adobe_glyphs, charset_encoding from ._utils import logger_warning from .errors import PdfReadWarning -from .generic import DecodedStreamObject, DictionaryObject +from .generic import DecodedStreamObject, DictionaryObject, NameObject # code freely inspired from @twiggy ; see #711 @@ -124,6 +124,7 @@ def parse_encoding( enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore if isinstance(enc, str): try: + enc = NameObject.unnumber(enc) # for #xx decoding if enc in charset_encoding: encoding = charset_encoding[enc].copy() elif enc in _predefined_cmap:
PyPDF2/generic/_base.py+11 −2 modified@@ -420,6 +420,14 @@ def writeToStream( deprecate_with_replacement("writeToStream", "write_to_stream") self.write_to_stream(stream, encryption_key) + @staticmethod + def unnumber(sin: str) -> str: + i = sin.find("#") + while i >= 0: + sin = sin[:i] + chr(int(sin[i + 1 : i + 3], 16)) + sin[i + 3 :] + i = sin.find("#") + return sin + @staticmethod def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader name = stream.read(1) @@ -431,10 +439,11 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader ret = name.decode("utf-8") except (UnicodeEncodeError, UnicodeDecodeError): ret = name.decode("gbk") - return NameObject(ret) - except (UnicodeEncodeError, UnicodeDecodeError) as e: # Name objects should represent irregular characters # with a '#' followed by the symbol's hex number + ret = NameObject.unnumber(ret) + return NameObject(ret) + except (UnicodeEncodeError, UnicodeDecodeError) as e: if not pdf.strict: logger_warning("Illegal character in Name Object", __name__) return NameObject(name)
PyPDF2/generic/_data_structures.py+25 −14 modified@@ -67,7 +67,6 @@ from ._utils import read_hex_string_from_stream, read_string_from_stream logger = logging.getLogger(__name__) -ObjectPrefix = b"/<[tf(n%" NumberSigns = b"+-" IndirectPattern = re.compile(rb"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]") @@ -263,10 +262,19 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader stream.read(1) break stream.seek(-1, 1) - key = read_object(stream, pdf) - tok = read_non_whitespace(stream) - stream.seek(-1, 1) - value = read_object(stream, pdf, forced_encoding) + try: + key = read_object(stream, pdf) + tok = read_non_whitespace(stream) + stream.seek(-1, 1) + value = read_object(stream, pdf, forced_encoding) + except Exception as exc: + if pdf is not None and pdf.strict: + raise PdfReadError(exc.__repr__()) + logger_warning(exc.__repr__(), __name__) + retval = DictionaryObject() + retval.update(data) + return retval # return partial data + if not data.get(key): data[key] = value else: @@ -812,10 +820,9 @@ def read_object( ) -> Union[PdfObject, int, str, ContentStream]: tok = stream.read(1) stream.seek(-1, 1) # reset to start - idx = ObjectPrefix.find(tok) - if idx == 0: + if tok == b"/": return NameObject.read_from_stream(stream, pdf) - elif idx == 1: + elif tok == b"<": # hexadecimal string OR dictionary peek = stream.read(2) stream.seek(-2, 1) # reset to start @@ -824,15 +831,15 @@ def read_object( return DictionaryObject.read_from_stream(stream, pdf, forced_encoding) else: return read_hex_string_from_stream(stream, forced_encoding) - elif idx == 2: + elif tok == b"[": return ArrayObject.read_from_stream(stream, pdf, forced_encoding) - elif idx == 3 or idx == 4: + elif tok == b"t" or tok == b"f": return BooleanObject.read_from_stream(stream) - elif idx == 5: + elif tok == b"(": return read_string_from_stream(stream, forced_encoding) - elif idx == 6: + elif tok == b"n": return NullObject.read_from_stream(stream) - elif idx == 7: + elif tok == b"%": # 
comment while tok not in (b"\r", b"\n"): tok = stream.read(1) @@ -843,14 +850,18 @@ def read_object( tok = read_non_whitespace(stream) stream.seek(-1, 1) return read_object(stream, pdf, forced_encoding) - else: + elif tok in b"0123456789+-.": # number object OR indirect reference peek = stream.read(20) stream.seek(-len(peek), 1) # reset to start if IndirectPattern.match(peek) is not None: return IndirectObject.read_from_stream(stream, pdf) else: return NumberObject.read_from_stream(stream) + else: + raise PdfReadError( + f"Invalid Elementary Object starting with {tok} @{stream.tell()}" # type: ignore + ) class Field(TreeObject):
PyPDF2/_reader.py+9 −0 modified@@ -1139,6 +1139,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: buf = bytes(self.stream.getbuffer()) # type: ignore else: p = self.stream.tell() + self.stream.seek(0, 0) buf = self.stream.read(-1) self.stream.seek(p, 0) m = re.search( @@ -1192,6 +1193,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]: buf = bytes(self.stream.getbuffer()) # type: ignore else: p = self.stream.tell() + self.stream.seek(0, 0) buf = self.stream.read(-1) self.stream.seek(p, 0) m = re.search( @@ -1883,6 +1885,13 @@ def xfa(self) -> Optional[Dict[str, Any]]: retval[tag] = es return retval + def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]: + """ + used to ease development + equivalent to generic.IndirectObject(num,gen,self).get_object() + """ + return IndirectObject(num, gen, self).get_object() + class PdfFileReader(PdfReader): # pragma: no cover def __init__(self, *args: Any, **kwargs: Any) -> None:
tests/test_generic.py+11 −0 modified@@ -175,6 +175,17 @@ def test_NameObject(): with pytest.raises(PdfReadError) as exc: NameObject.read_from_stream(stream, None) assert exc.value.args[0] == "name read error" + assert ( + NameObject.read_from_stream( + BytesIO(b"/A;Name_With-Various***Characters?"), None + ) + == "/A;Name_With-Various***Characters?" + ) + assert ( + NameObject.read_from_stream(BytesIO(b"/paired#28#29parentheses"), None) + == "/paired()parentheses" + ) + assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB" def test_destination_fit_r():
Vulnerability mechanics
Generated by null/stub on May 9, 2026. Inputs: CWE entries + fix-commit diffs from this CVE's patches. Citations validated against bundle.
References (6)
- github.com/advisories/GHSA-hm9v-vj3r-r55m (source: ghsa; type: ADVISORY)
- nvd.nist.gov/vuln/detail/CVE-2023-36807 (source: ghsa; type: ADVISORY)
- github.com/py-pdf/pypdf/commit/e6531a25325e7e0174b6a1ba03b57320b5227f6b (source: ghsa; type: WEB)
- github.com/py-pdf/pypdf/issues/1329 (source: ghsa, x_refsource_MISC; type: WEB)
- github.com/py-pdf/pypdf/pull/1331 (source: ghsa, x_refsource_MISC; type: WEB)
- github.com/py-pdf/pypdf/security/advisories/GHSA-hm9v-vj3r-r55m (source: ghsa, x_refsource_CONFIRM; type: WEB)
News mentions (0)
No linked articles in our index yet.