Docling: Unsafe Zip Extraction in EasyOCR Model Download
Description
Impact
In versions < 2.91.0, the EasyOCR model download functionality extracted ZIP archives without validating member paths, enabling Zip Slip attacks. If an attacker could compromise the model download source (via supply chain attack, DNS spoofing, or MITM), they could write arbitrary files to any location writable by the process, potentially achieving:
- Remote code execution by overwriting Python files or system binaries
- Persistent backdoors by modifying startup scripts or SSH keys
- Data corruption or system compromise
Patches
Fixed in version 2.91.0. The extraction process now validates each archive member path using os.path.realpath() to ensure it remains within the target directory, raising a SecurityError for any path traversal attempts.
Workarounds
Ensure model downloads occur over secure, authenticated channels. Use integrity verification (checksums) for downloaded models. Run the application with minimal file system permissions.
References
- Fix release: v2.91.0
Affected products (1)
- Range: <2.91.0
Patches
25e161ac18548fix: EasyOCR model downloading (#3339)
2 files changed · +13 −1
docling/exceptions.py+4 −0 modified@@ -8,3 +8,7 @@ class ConversionError(BaseError): class OperationNotAllowed(BaseError): pass + + +class SecurityError(BaseError): + pass
docling/models/stages/ocr/easyocr_model.py+9 −1 modified@@ -1,4 +1,5 @@ import logging +import os import warnings import zipfile from collections.abc import Iterable @@ -17,6 +18,7 @@ OcrOptions, ) from docling.datamodel.settings import settings +from docling.exceptions import SecurityError from docling.models.base_ocr_model import BaseOcrModel from docling.utils.accelerator_utils import decide_device from docling.utils.profiling import TimeRecorder @@ -122,7 +124,13 @@ def download_models( for model_details in download_list: buf = download_url_with_progress(model_details["url"], progress=progress) with zipfile.ZipFile(buf, "r") as zip_ref: - zip_ref.extractall(local_dir) + for member in zip_ref.infolist(): + member_path = os.path.realpath( + os.path.join(local_dir, member.filename) + ) + if not member_path.startswith(os.path.realpath(local_dir) + os.sep): + raise SecurityError(f"ZIP slip attempt: {member.filename}") + zip_ref.extract(member, local_dir) return local_dir
c1dbac22c7a0fix: strengthen input validation for METS‑GBS processing (#3336)
6 files changed · +252 −22
docling/backend/mets_gbs_backend.py+64 −9 modified@@ -21,8 +21,8 @@ from PIL import Image from PIL.Image import Image as PILImage -from docling.backend.abstract_backend import PaginatedDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend +from docling.datamodel.backend_options import MetsGbsBackendOptions from docling.datamodel.base_models import InputFormat if TYPE_CHECKING: @@ -195,22 +195,46 @@ def _extract_confidence(title_str) -> float: class MetsGbsDocumentBackend(PdfDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): - super().__init__(in_doc, path_or_stream) - + def __init__( + self, + in_doc: "InputDocument", + path_or_stream: Union[BytesIO, Path], + options: MetsGbsBackendOptions = MetsGbsBackendOptions(), + ): + super().__init__(in_doc, path_or_stream, options) + self.options: MetsGbsBackendOptions self._tar: tarfile.TarFile = ( tarfile.open(name=self.path_or_stream, mode="r:gz") if isinstance(self.path_or_stream, Path) else tarfile.open(fileobj=self.path_or_stream, mode="r:gz") ) self.root_mets: Optional[etree._Element] = None self.page_map: Dict[int, _PageFiles] = {} + self._total_bytes_extracted = 0 + member_count = 0 for member in self._tar.getmembers(): + member_count += 1 + if member_count > self.options.max_member_count: + raise ValueError( + f"Archive exceeds maximum member count limit of {self.options.max_member_count}" + ) + if member.name.endswith(".xml"): file = self._tar.extractfile(member) if file is not None: - content = file.read() + content = file.read(self.options.max_file_bytes + 1) + if len(content) > self.options.max_file_bytes: + raise ValueError( + f"XML file {member.name} exceeds size limit of {self.options.max_file_bytes} bytes" + ) + + self._total_bytes_extracted += len(content) + if self._total_bytes_extracted > self.options.max_total_bytes: + raise ValueError( + f"Archive exceeds maximum total extraction size of 
{self.options.max_total_bytes} bytes" + ) + self.root_mets = self._validate_mets_xml(content) if self.root_mets is not None: break @@ -283,7 +307,11 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] self.page_map[page_no] = page_files def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]: - root: etree._Element = etree.fromstring(xml_string) + # Security: disable entity resolution + parser = etree.XMLParser( + resolve_entities=False, load_dtd=False, no_network=True + ) + root: etree._Element = etree.fromstring(xml_string, parser=parser) if ( root.tag == "{http://www.loc.gov/METS/}mets" and root.get("PROFILE") == "gbs" @@ -300,14 +328,41 @@ def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]: ocr_info = self.page_map[page_no].coordOCR assert ocr_info is not None + # Security: limit extraction size to prevent decompression bombs image_file = self._tar.extractfile(image_info.path) assert image_file is not None - buf = BytesIO(image_file.read()) + image_data = image_file.read(self.options.max_file_bytes + 1) + if len(image_data) > self.options.max_file_bytes: + raise ValueError( + f"Image file {image_info.path} exceeds individual file size limit of {self.options.max_file_bytes} bytes" + ) + + # Security: Track total bytes extracted + self._total_bytes_extracted += len(image_data) + if self._total_bytes_extracted > self.options.max_total_bytes: + raise ValueError( + f"Total extracted data exceeds maximum limit of {self.options.max_total_bytes} bytes" + ) + + buf = BytesIO(image_data) im: PILImage = Image.open(buf) + ocr_file = self._tar.extractfile(ocr_info.path) assert ocr_file is not None - ocr_content = ocr_file.read() - parser = etree.HTMLParser() + ocr_content = ocr_file.read(self.options.max_file_bytes + 1) + if len(ocr_content) > self.options.max_file_bytes: + raise ValueError( + f"OCR file {ocr_info.path} exceeds individual file size limit of {self.options.max_file_bytes} bytes" + ) + + # 
Security: Track total bytes extracted + self._total_bytes_extracted += len(ocr_content) + if self._total_bytes_extracted > self.options.max_total_bytes: + raise ValueError( + f"Total extracted data exceeds maximum limit of {self.options.max_total_bytes} bytes" + ) + + parser = etree.HTMLParser(no_network=True) ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser) line_cells: List[TextCell] = []
docling/datamodel/backend_options.py+22 −0 modified@@ -127,6 +127,27 @@ class PdfBackendOptions(BaseBackendOptions): password: Optional[SecretStr] = None +class MetsGbsBackendOptions(PdfBackendOptions): + """Options specific to the METS-GBS document backend.""" + + kind: Annotated[Literal["mets-gbs"], Field(exclude=True, repr=False)] = "mets-gbs" # type: ignore[assignment] + max_total_bytes: Annotated[ + PositiveInt, + Field( + description="Maximum cumulative size in bytes of all data extracted from the archive during processing" + ), + ] = 300 * 1024 * 1024 + max_file_bytes: Annotated[ + PositiveInt, + Field( + description="Maximum size in bytes for any single file extracted from the archive" + ), + ] = 10 * 1024 * 1024 + max_member_count: Annotated[ + PositiveInt, Field(description="Maximum number of archive members to process") + ] = 1000 + + class MsExcelBackendOptions(BaseBackendOptions): """Options specific to the MS Excel backend.""" @@ -194,6 +215,7 @@ class XBRLBackendOptions(BaseBackendOptions): HTMLBackendOptions, MarkdownBackendOptions, PdfBackendOptions, + MetsGbsBackendOptions, MsExcelBackendOptions, LatexBackendOptions, XBRLBackendOptions,
docling/datamodel/document.py+39 −13 modified@@ -61,7 +61,7 @@ DeclarativeDocumentBackend, PaginatedDocumentBackend, ) -from docling.datamodel.backend_options import BackendOptions +from docling.datamodel.backend_options import BackendOptions, MetsGbsBackendOptions from docling.datamodel.base_models import ( AssembledUnit, ConfidenceReport, @@ -750,19 +750,45 @@ def _detect_csv( def _detect_mets_gbs( obj: Union[Path, DocumentStream], ) -> Optional[Literal["application/mets+xml"]]: + # Use default limits for safe format detection + default_options = MetsGbsBackendOptions() + max_file_bytes = default_options.max_file_bytes + max_member_count = default_options.max_member_count + content = obj if isinstance(obj, Path) else obj.stream tar: tarfile.TarFile member: tarfile.TarInfo - with tarfile.open( - name=content if isinstance(content, Path) else None, - fileobj=content if isinstance(content, BytesIO) else None, - mode="r:gz", - ) as tar: - for member in tar.getmembers(): - if member.name.endswith(".xml"): - file = tar.extractfile(member) - if file is not None: - content_str = file.read().decode(errors="ignore") - if "http://www.loc.gov/METS/" in content_str: - return "application/mets+xml" + member_count = 0 + + try: + with tarfile.open( + name=content if isinstance(content, Path) else None, + fileobj=content if isinstance(content, BytesIO) else None, + mode="r:gz", + ) as tar: + for member in tar.getmembers(): + member_count += 1 + if member_count > max_member_count: + _log.warning( + f"Archive exceeds member count limit ({max_member_count}) during format detection" + ) + return None + + if member.name.endswith(".xml"): + file = tar.extractfile(member) + if file is not None: + xml_content = file.read(max_file_bytes + 1) + if len(xml_content) > max_file_bytes: + _log.warning( + f"XML file {member.name} exceeds size limit ({max_file_bytes} bytes) during format detection" + ) + continue + + content_str = xml_content.decode(errors="ignore") + if 
"http://www.loc.gov/METS/" in content_str: + return "application/mets+xml" + except Exception as e: + _log.warning(f"Error during METS-GBS format detection: {e}") + return None + return None
docling/document_converter.py+7 −0 modified@@ -40,6 +40,7 @@ HTMLBackendOptions, LatexBackendOptions, MarkdownBackendOptions, + MetsGbsBackendOptions, PdfBackendOptions, XBRLBackendOptions, ) @@ -150,6 +151,12 @@ class PdfFormatOption(FormatOption): backend_options: Optional[PdfBackendOptions] = None +class MetsGbsFormatOption(FormatOption): + pipeline_cls: Type = StandardPdfPipeline + backend: Type[AbstractDocumentBackend] = MetsGbsDocumentBackend + backend_options: MetsGbsBackendOptions | None = None + + class AudioFormatOption(FormatOption): pipeline_cls: Type = AsrPipeline backend: Type[AbstractDocumentBackend] = NoOpBackend
tests/test_backend_mets_gbs.py+110 −0 modified@@ -3,6 +3,7 @@ import pytest from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend, MetsGbsPageBackend +from docling.datamodel.backend_options import MetsGbsBackendOptions from docling.datamodel.base_models import BoundingBox, InputFormat from docling.datamodel.document import InputDocument @@ -75,3 +76,112 @@ def test_num_pages(test_doc_path): # Explicitly clean up resources to prevent race conditions in CI doc_backend.unload() + + +def test_max_file_bytes_limit(test_doc_path): + """Test that max_file_bytes limit is enforced during extraction.""" + + options = MetsGbsBackendOptions(max_file_bytes=100) + + with pytest.raises(ValueError, match=r"exceeds.*size limit"): + InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + +def test_max_total_bytes_limit(test_doc_path): + """Test that max_total_bytes limit is enforced across all extractions.""" + + options = MetsGbsBackendOptions( + max_file_bytes=10 * 1024 * 1024, + max_total_bytes=1000, + ) + + with pytest.raises(ValueError, match="exceeds maximum total extraction size"): + InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + +def test_max_member_count_limit(test_doc_path): + """Test that max_member_count limit is enforced during extraction.""" + + options = MetsGbsBackendOptions(max_member_count=2) + + with pytest.raises(ValueError, match="exceeds maximum member count limit"): + InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + +def test_limits_with_valid_values(test_doc_path): + """Test that processing succeeds with generous limits.""" + options = MetsGbsBackendOptions( + max_file_bytes=10 * 1024 * 1024, # 10 MB + max_total_bytes=300 * 1024 * 1024, # 300 MB + 
max_member_count=1000, + ) + + in_doc = InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + assert in_doc.valid + doc_backend: MetsGbsDocumentBackend = in_doc._backend + assert doc_backend.is_valid() + assert doc_backend.page_count() == 3 + + page_backend: MetsGbsPageBackend = doc_backend.load_page(0) + assert page_backend.is_valid() + + page_backend.unload() + doc_backend.unload() + + +def test_total_bytes_tracking_across_pages(test_doc_path): + """Test that total bytes are tracked cumulatively across initialization and page loading. + + This test ensures that when max_total_bytes is larger than max_file_bytes, + initialization succeeds but page loading eventually fails due to cumulative limit. + """ + options = MetsGbsBackendOptions( + max_file_bytes=10 * 1024 * 1024, + max_total_bytes=20 * 1024, + max_member_count=1000, + ) + + in_doc = InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.METS_GBS, + backend=MetsGbsDocumentBackend, + backend_options=options, + ) + + assert in_doc.valid + doc_backend: MetsGbsDocumentBackend = in_doc._backend + assert doc_backend.is_valid() + + page_load_failed = False + for page_index in range(doc_backend.page_count()): + try: + page_backend: MetsGbsPageBackend = doc_backend.load_page(page_index) + page_backend.unload() + except ValueError as e: + assert "Total extracted data exceeds maximum limit" in str(e) + page_load_failed = True + break + + assert page_load_failed, "Expected page loading to fail due to total bytes limit" + doc_backend.unload()
tests/test_input_doc.py+10 −0 modified@@ -248,6 +248,16 @@ def test_guess_format(tmp_path): # Plain .txt file (not USPTO) should be detected as Markdown stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz")) assert dci._guess_format(stream) == InputFormat.MD + + # Valid METS-GBS archive + mets_gbs_path = Path("./tests/data/mets_gbs/32044009881525_select.tar.gz") + if mets_gbs_path.exists(): + assert dci._guess_format(mets_gbs_path) == InputFormat.METS_GBS + + buf = BytesIO(mets_gbs_path.open("rb").read()) + stream = DocumentStream(name="32044009881525_select.tar.gz", stream=buf) + assert dci._guess_format(stream) == InputFormat.METS_GBS + doc_path = temp_dir / "pftaps_wrong.txt" doc_path.write_text("xyz", encoding="utf-8") assert dci._guess_format(doc_path) == InputFormat.MD
Vulnerability mechanics
No source-code context for this CVE — mechanics is only generated when we can read the actual fix diff. Without that, the four sections (root cause, attack vector, affected code, fix) would be speculation rather than analysis.
References (3)
News mentions (1)
- Docling Project: Eight High-Severity Vulnerabilities Disclosed Together — Vypr Intelligence · Jun 3, 2026