Docling: Unsafe Archive Extraction and XML Parsing in METS-GBS Backend
Description
Impact
The METS-GBS backend's XML parsing and the input document format detection lacked security controls, enabling:
- XML External Entity (XXE) attacks that read local files or cause denial of service
- Decompression bombs (zip bombs) that exhaust memory and disk space
- Unbounded archive extraction that consumes system resources
An attacker could craft a malicious METS-GBS archive that, when processed, reads sensitive files, exhausts system resources, or crashes the application.
Patches
Fixed in version 2.91.0. The fix implements:
- Secure XML parsing with `resolve_entities=False`, `load_dtd=False`, and `no_network=True`
- Configurable limits: 300 MB total extraction size, 10 MB per file, and a member count of 1000
- Cumulative size tracking across all extractions
- Early termination when limits are exceeded
- Secure format detection of METS-GBS tar archives in the `_detect_mets_gbs()` method: a maximum file size (10 MB per file), a maximum member count (1000 members), and exception handling that fails gracefully when limits are exceeded
Workarounds
Avoid processing METS-GBS archives from untrusted sources. If necessary, pre-validate archives in an isolated environment with resource limits.
### References
- Fix release: v2.91.0
Affected products (2)
- Range: <2.91.0
Patches (1)
- c1dbac22c7a0 — fix: strengthen input validation for METS‑GBS processing (#3336)
6 files changed · +252 −22
docling/backend/mets_gbs_backend.py+64 −9 modified@@ -21,8 +21,8 @@ from PIL import Image from PIL.Image import Image as PILImage -from docling.backend.abstract_backend import PaginatedDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend +from docling.datamodel.backend_options import MetsGbsBackendOptions from docling.datamodel.base_models import InputFormat if TYPE_CHECKING: @@ -195,22 +195,46 @@ def _extract_confidence(title_str) -> float: class MetsGbsDocumentBackend(PdfDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): - super().__init__(in_doc, path_or_stream) - + def __init__( + self, + in_doc: "InputDocument", + path_or_stream: Union[BytesIO, Path], + options: MetsGbsBackendOptions = MetsGbsBackendOptions(), + ): + super().__init__(in_doc, path_or_stream, options) + self.options: MetsGbsBackendOptions self._tar: tarfile.TarFile = ( tarfile.open(name=self.path_or_stream, mode="r:gz") if isinstance(self.path_or_stream, Path) else tarfile.open(fileobj=self.path_or_stream, mode="r:gz") ) self.root_mets: Optional[etree._Element] = None self.page_map: Dict[int, _PageFiles] = {} + self._total_bytes_extracted = 0 + member_count = 0 for member in self._tar.getmembers(): + member_count += 1 + if member_count > self.options.max_member_count: + raise ValueError( + f"Archive exceeds maximum member count limit of {self.options.max_member_count}" + ) + if member.name.endswith(".xml"): file = self._tar.extractfile(member) if file is not None: - content = file.read() + content = file.read(self.options.max_file_bytes + 1) + if len(content) > self.options.max_file_bytes: + raise ValueError( + f"XML file {member.name} exceeds size limit of {self.options.max_file_bytes} bytes" + ) + + self._total_bytes_extracted += len(content) + if self._total_bytes_extracted > self.options.max_total_bytes: + raise ValueError( + f"Archive exceeds maximum total extraction size of 
{self.options.max_total_bytes} bytes" + ) + self.root_mets = self._validate_mets_xml(content) if self.root_mets is not None: break @@ -283,7 +307,11 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] self.page_map[page_no] = page_files def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]: - root: etree._Element = etree.fromstring(xml_string) + # Security: disable entity resolution + parser = etree.XMLParser( + resolve_entities=False, load_dtd=False, no_network=True + ) + root: etree._Element = etree.fromstring(xml_string, parser=parser) if ( root.tag == "{http://www.loc.gov/METS/}mets" and root.get("PROFILE") == "gbs" @@ -300,14 +328,41 @@ def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]: ocr_info = self.page_map[page_no].coordOCR assert ocr_info is not None + # Security: limit extraction size to prevent decompression bombs image_file = self._tar.extractfile(image_info.path) assert image_file is not None - buf = BytesIO(image_file.read()) + image_data = image_file.read(self.options.max_file_bytes + 1) + if len(image_data) > self.options.max_file_bytes: + raise ValueError( + f"Image file {image_info.path} exceeds individual file size limit of {self.options.max_file_bytes} bytes" + ) + + # Security: Track total bytes extracted + self._total_bytes_extracted += len(image_data) + if self._total_bytes_extracted > self.options.max_total_bytes: + raise ValueError( + f"Total extracted data exceeds maximum limit of {self.options.max_total_bytes} bytes" + ) + + buf = BytesIO(image_data) im: PILImage = Image.open(buf) + ocr_file = self._tar.extractfile(ocr_info.path) assert ocr_file is not None - ocr_content = ocr_file.read() - parser = etree.HTMLParser() + ocr_content = ocr_file.read(self.options.max_file_bytes + 1) + if len(ocr_content) > self.options.max_file_bytes: + raise ValueError( + f"OCR file {ocr_info.path} exceeds individual file size limit of {self.options.max_file_bytes} bytes" + ) + + # 
Security: Track total bytes extracted + self._total_bytes_extracted += len(ocr_content) + if self._total_bytes_extracted > self.options.max_total_bytes: + raise ValueError( + f"Total extracted data exceeds maximum limit of {self.options.max_total_bytes} bytes" + ) + + parser = etree.HTMLParser(no_network=True) ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser) line_cells: List[TextCell] = []
docling/datamodel/backend_options.py+22 −0 modified@@ -127,6 +127,27 @@ class PdfBackendOptions(BaseBackendOptions): password: Optional[SecretStr] = None +class MetsGbsBackendOptions(PdfBackendOptions): + """Options specific to the METS-GBS document backend.""" + + kind: Annotated[Literal["mets-gbs"], Field(exclude=True, repr=False)] = "mets-gbs" # type: ignore[assignment] + max_total_bytes: Annotated[ + PositiveInt, + Field( + description="Maximum cumulative size in bytes of all data extracted from the archive during processing" + ), + ] = 300 * 1024 * 1024 + max_file_bytes: Annotated[ + PositiveInt, + Field( + description="Maximum size in bytes for any single file extracted from the archive" + ), + ] = 10 * 1024 * 1024 + max_member_count: Annotated[ + PositiveInt, Field(description="Maximum number of archive members to process") + ] = 1000 + + class MsExcelBackendOptions(BaseBackendOptions): """Options specific to the MS Excel backend.""" @@ -194,6 +215,7 @@ class XBRLBackendOptions(BaseBackendOptions): HTMLBackendOptions, MarkdownBackendOptions, PdfBackendOptions, + MetsGbsBackendOptions, MsExcelBackendOptions, LatexBackendOptions, XBRLBackendOptions,
docling/datamodel/document.py+39 −13 modified@@ -61,7 +61,7 @@ DeclarativeDocumentBackend, PaginatedDocumentBackend, ) -from docling.datamodel.backend_options import BackendOptions +from docling.datamodel.backend_options import BackendOptions, MetsGbsBackendOptions from docling.datamodel.base_models import ( AssembledUnit, ConfidenceReport, @@ -750,19 +750,45 @@ def _detect_csv( def _detect_mets_gbs( obj: Union[Path, DocumentStream], ) -> Optional[Literal["application/mets+xml"]]: + # Use default limits for safe format detection + default_options = MetsGbsBackendOptions() + max_file_bytes = default_options.max_file_bytes + max_member_count = default_options.max_member_count + content = obj if isinstance(obj, Path) else obj.stream tar: tarfile.TarFile member: tarfile.TarInfo - with tarfile.open( - name=content if isinstance(content, Path) else None, - fileobj=content if isinstance(content, BytesIO) else None, - mode="r:gz", - ) as tar: - for member in tar.getmembers(): - if member.name.endswith(".xml"): - file = tar.extractfile(member) - if file is not None: - content_str = file.read().decode(errors="ignore") - if "http://www.loc.gov/METS/" in content_str: - return "application/mets+xml" + member_count = 0 + + try: + with tarfile.open( + name=content if isinstance(content, Path) else None, + fileobj=content if isinstance(content, BytesIO) else None, + mode="r:gz", + ) as tar: + for member in tar.getmembers(): + member_count += 1 + if member_count > max_member_count: + _log.warning( + f"Archive exceeds member count limit ({max_member_count}) during format detection" + ) + return None + + if member.name.endswith(".xml"): + file = tar.extractfile(member) + if file is not None: + xml_content = file.read(max_file_bytes + 1) + if len(xml_content) > max_file_bytes: + _log.warning( + f"XML file {member.name} exceeds size limit ({max_file_bytes} bytes) during format detection" + ) + continue + + content_str = xml_content.decode(errors="ignore") + if 
"http://www.loc.gov/METS/" in content_str: + return "application/mets+xml" + except Exception as e: + _log.warning(f"Error during METS-GBS format detection: {e}") + return None + return None
docling/document_converter.py+7 −0 modified@@ -40,6 +40,7 @@ HTMLBackendOptions, LatexBackendOptions, MarkdownBackendOptions, + MetsGbsBackendOptions, PdfBackendOptions, XBRLBackendOptions, ) @@ -150,6 +151,12 @@ class PdfFormatOption(FormatOption): backend_options: Optional[PdfBackendOptions] = None +class MetsGbsFormatOption(FormatOption): + pipeline_cls: Type = StandardPdfPipeline + backend: Type[AbstractDocumentBackend] = MetsGbsDocumentBackend + backend_options: MetsGbsBackendOptions | None = None + + class AudioFormatOption(FormatOption): pipeline_cls: Type = AsrPipeline backend: Type[AbstractDocumentBackend] = NoOpBackend
tests/test_backend_mets_gbs.py+110 −0 modified@@ -3,6 +3,7 @@ import pytest from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend, MetsGbsPageBackend +from docling.datamodel.backend_options import MetsGbsBackendOptions from docling.datamodel.base_models import BoundingBox, InputFormat from docling.datamodel.document import InputDocument @@ -75,3 +76,112 @@ def test_num_pages(test_doc_path): # Explicitly clean up resources to prevent race conditions in CI doc_backend.unload() + + +def test_max_file_bytes_limit(test_doc_path): + """Test that max_file_bytes limit is enforced during extraction.""" + + options = MetsGbsBackendOptions(max_file_bytes=100) + + with pytest.raises(ValueError, match=r"exceeds.*size limit"): + InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + +def test_max_total_bytes_limit(test_doc_path): + """Test that max_total_bytes limit is enforced across all extractions.""" + + options = MetsGbsBackendOptions( + max_file_bytes=10 * 1024 * 1024, + max_total_bytes=1000, + ) + + with pytest.raises(ValueError, match="exceeds maximum total extraction size"): + InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + +def test_max_member_count_limit(test_doc_path): + """Test that max_member_count limit is enforced during extraction.""" + + options = MetsGbsBackendOptions(max_member_count=2) + + with pytest.raises(ValueError, match="exceeds maximum member count limit"): + InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + +def test_limits_with_valid_values(test_doc_path): + """Test that processing succeeds with generous limits.""" + options = MetsGbsBackendOptions( + max_file_bytes=10 * 1024 * 1024, # 10 MB + max_total_bytes=300 * 1024 * 1024, # 300 MB + 
max_member_count=1000, + ) + + in_doc = InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + assert in_doc.valid + doc_backend: MetsGbsDocumentBackend = in_doc._backend + assert doc_backend.is_valid() + assert doc_backend.page_count() == 3 + + page_backend: MetsGbsPageBackend = doc_backend.load_page(0) + assert page_backend.is_valid() + + page_backend.unload() + doc_backend.unload() + + +def test_total_bytes_tracking_across_pages(test_doc_path): + """Test that total bytes are tracked cumulatively across initialization and page loading. + + This test ensures that when max_total_bytes is larger than max_file_bytes, + initialization succeeds but page loading eventually fails due to cumulative limit. + """ + options = MetsGbsBackendOptions( + max_file_bytes=10 * 1024 * 1024, + max_total_bytes=20 * 1024, + max_member_count=1000, + ) + + in_doc = InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + assert in_doc.valid + doc_backend: MetsGbsDocumentBackend = in_doc._backend + assert doc_backend.is_valid() + + page_load_failed = False + for page_index in range(doc_backend.page_count()): + try: + page_backend: MetsGbsPageBackend = doc_backend.load_page(page_index) + page_backend.unload() + except ValueError as e: + assert "Total extracted data exceeds maximum limit" in str(e) + page_load_failed = True + break + + assert page_load_failed, "Expected page loading to fail due to total bytes limit" + doc_backend.unload()
tests/test_input_doc.py+10 −0 modified@@ -248,6 +248,16 @@ def test_guess_format(tmp_path): # Plain .txt file (not USPTO) should be detected as Markdown stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz")) assert dci._guess_format(stream) == InputFormat.MD + + # Valid METS-GBS archive + mets_gbs_path = Path("./tests/data/mets_gbs/32044009881525_select.tar.gz") + if mets_gbs_path.exists(): + assert dci._guess_format(mets_gbs_path) == InputFormat.METS_GBS + + buf = BytesIO(mets_gbs_path.open("rb").read()) + stream = DocumentStream(name="32044009881525_select.tar.gz", stream=buf) + assert dci._guess_format(stream) == InputFormat.METS_GBS + doc_path = temp_dir / "pftaps_wrong.txt" doc_path.write_text("xyz", encoding="utf-8") assert dci._guess_format(doc_path) == InputFormat.MD
Vulnerability mechanics
No source-code context for this CVE — mechanics is only generated when we can read the actual fix diff. Without that, the four sections (root cause, attack vector, affected code, fix) would be speculation rather than analysis.
References (3)
News mentions (1)
- Docling Project: Eight High-Severity Vulnerabilities Disclosed Together — Vypr Intelligence · Jun 3, 2026