VYPR
High severity · 7.5 · NVD Advisory · Published Jun 3, 2026

Docling: Unsafe Zip Extraction in EasyOCR Model Download

CVE-2026-44017

Description

Impact

In versions < 2.91.0, the EasyOCR model download functionality extracted ZIP archives without validating member paths, enabling Zip Slip attacks. If an attacker could compromise the model download source (via supply chain attack, DNS spoofing, or MITM), they could write arbitrary files to any location writable by the process, potentially achieving:

- Remote code execution by overwriting Python files or system binaries
- Persistent backdoors by modifying startup scripts or SSH keys
- Data corruption or system compromise

Patches

Fixed in version 2.91.0. The extraction process now validates each archive member path using os.path.realpath() to ensure it remains within the target directory, raising a SecurityError for any path traversal attempts.

Workarounds

Ensure model downloads occur over secure, authenticated channels. Use integrity verification (checksums) for downloaded models. Run the application with minimal file system permissions.

References

- Fix release: v2.91.0

Affected products

1

Patches

2
5e161ac18548

fix: EasyOCR model downloading (#3339)

https://github.com/docling-project/docling · Nikos Livathinos · Apr 23, 2026 · Fixed in 2.91.0 · via llm-release-walk
2 files changed · +13 -1
  • docling/exceptions.py+4 0 modified
    @@ -8,3 +8,7 @@ class ConversionError(BaseError):
     
     class OperationNotAllowed(BaseError):
         pass
    +
    +
    +class SecurityError(BaseError):
    +    pass
    
  • docling/models/stages/ocr/easyocr_model.py+9 1 modified
    @@ -1,4 +1,5 @@
     import logging
    +import os
     import warnings
     import zipfile
     from collections.abc import Iterable
    @@ -17,6 +18,7 @@
         OcrOptions,
     )
     from docling.datamodel.settings import settings
    +from docling.exceptions import SecurityError
     from docling.models.base_ocr_model import BaseOcrModel
     from docling.utils.accelerator_utils import decide_device
     from docling.utils.profiling import TimeRecorder
    @@ -122,7 +124,13 @@ def download_models(
             for model_details in download_list:
                 buf = download_url_with_progress(model_details["url"], progress=progress)
                 with zipfile.ZipFile(buf, "r") as zip_ref:
    -                zip_ref.extractall(local_dir)
    +                for member in zip_ref.infolist():
    +                    member_path = os.path.realpath(
    +                        os.path.join(local_dir, member.filename)
    +                    )
    +                    if not member_path.startswith(os.path.realpath(local_dir) + os.sep):
    +                        raise SecurityError(f"ZIP slip attempt: {member.filename}")
    +                    zip_ref.extract(member, local_dir)
     
             return local_dir
     
    
c1dbac22c7a0

fix: strengthen input validation for METS‑GBS processing (#3336)

https://github.com/docling-project/docling · Cesar Berrospi Ramis · Apr 23, 2026 · Fixed in 2.91.0 · via llm-release-walk
6 files changed · +252 -22
  • docling/backend/mets_gbs_backend.py+64 9 modified
    @@ -21,8 +21,8 @@
     from PIL import Image
     from PIL.Image import Image as PILImage
     
    -from docling.backend.abstract_backend import PaginatedDocumentBackend
     from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
    +from docling.datamodel.backend_options import MetsGbsBackendOptions
     from docling.datamodel.base_models import InputFormat
     
     if TYPE_CHECKING:
    @@ -195,22 +195,46 @@ def _extract_confidence(title_str) -> float:
     
     
     class MetsGbsDocumentBackend(PdfDocumentBackend):
    -    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    -        super().__init__(in_doc, path_or_stream)
    -
    +    def __init__(
    +        self,
    +        in_doc: "InputDocument",
    +        path_or_stream: Union[BytesIO, Path],
    +        options: MetsGbsBackendOptions = MetsGbsBackendOptions(),
    +    ):
    +        super().__init__(in_doc, path_or_stream, options)
    +        self.options: MetsGbsBackendOptions
             self._tar: tarfile.TarFile = (
                 tarfile.open(name=self.path_or_stream, mode="r:gz")
                 if isinstance(self.path_or_stream, Path)
                 else tarfile.open(fileobj=self.path_or_stream, mode="r:gz")
             )
             self.root_mets: Optional[etree._Element] = None
             self.page_map: Dict[int, _PageFiles] = {}
    +        self._total_bytes_extracted = 0
    +        member_count = 0
     
             for member in self._tar.getmembers():
    +            member_count += 1
    +            if member_count > self.options.max_member_count:
    +                raise ValueError(
    +                    f"Archive exceeds maximum member count limit of {self.options.max_member_count}"
    +                )
    +
                 if member.name.endswith(".xml"):
                     file = self._tar.extractfile(member)
                     if file is not None:
    -                    content = file.read()
    +                    content = file.read(self.options.max_file_bytes + 1)
    +                    if len(content) > self.options.max_file_bytes:
    +                        raise ValueError(
    +                            f"XML file {member.name} exceeds size limit of {self.options.max_file_bytes} bytes"
    +                        )
    +
    +                    self._total_bytes_extracted += len(content)
    +                    if self._total_bytes_extracted > self.options.max_total_bytes:
    +                        raise ValueError(
    +                            f"Archive exceeds maximum total extraction size of {self.options.max_total_bytes} bytes"
    +                        )
    +
                         self.root_mets = self._validate_mets_xml(content)
                         if self.root_mets is not None:
                             break
    @@ -283,7 +307,11 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
                 self.page_map[page_no] = page_files
     
         def _validate_mets_xml(self, xml_string) -> Optional[etree._Element]:
    -        root: etree._Element = etree.fromstring(xml_string)
    +        # Security: disable entity resolution
    +        parser = etree.XMLParser(
    +            resolve_entities=False, load_dtd=False, no_network=True
    +        )
    +        root: etree._Element = etree.fromstring(xml_string, parser=parser)
             if (
                 root.tag == "{http://www.loc.gov/METS/}mets"
                 and root.get("PROFILE") == "gbs"
    @@ -300,14 +328,41 @@ def _parse_page(self, page_no: int) -> Tuple[SegmentedPdfPage, PILImage]:
             ocr_info = self.page_map[page_no].coordOCR
             assert ocr_info is not None
     
    +        # Security: limit extraction size to prevent decompression bombs
             image_file = self._tar.extractfile(image_info.path)
             assert image_file is not None
    -        buf = BytesIO(image_file.read())
    +        image_data = image_file.read(self.options.max_file_bytes + 1)
    +        if len(image_data) > self.options.max_file_bytes:
    +            raise ValueError(
    +                f"Image file {image_info.path} exceeds individual file size limit of {self.options.max_file_bytes} bytes"
    +            )
    +
    +        # Security: Track total bytes extracted
    +        self._total_bytes_extracted += len(image_data)
    +        if self._total_bytes_extracted > self.options.max_total_bytes:
    +            raise ValueError(
    +                f"Total extracted data exceeds maximum limit of {self.options.max_total_bytes} bytes"
    +            )
    +
    +        buf = BytesIO(image_data)
             im: PILImage = Image.open(buf)
    +
             ocr_file = self._tar.extractfile(ocr_info.path)
             assert ocr_file is not None
    -        ocr_content = ocr_file.read()
    -        parser = etree.HTMLParser()
    +        ocr_content = ocr_file.read(self.options.max_file_bytes + 1)
    +        if len(ocr_content) > self.options.max_file_bytes:
    +            raise ValueError(
    +                f"OCR file {ocr_info.path} exceeds individual file size limit of {self.options.max_file_bytes} bytes"
    +            )
    +
    +        # Security: Track total bytes extracted
    +        self._total_bytes_extracted += len(ocr_content)
    +        if self._total_bytes_extracted > self.options.max_total_bytes:
    +            raise ValueError(
    +                f"Total extracted data exceeds maximum limit of {self.options.max_total_bytes} bytes"
    +            )
    +
    +        parser = etree.HTMLParser(no_network=True)
             ocr_root: etree._Element = etree.fromstring(ocr_content, parser=parser)
     
             line_cells: List[TextCell] = []
    
  • docling/datamodel/backend_options.py+22 0 modified
    @@ -127,6 +127,27 @@ class PdfBackendOptions(BaseBackendOptions):
         password: Optional[SecretStr] = None
     
     
    +class MetsGbsBackendOptions(PdfBackendOptions):
    +    """Options specific to the METS-GBS document backend."""
    +
    +    kind: Annotated[Literal["mets-gbs"], Field(exclude=True, repr=False)] = "mets-gbs"  # type: ignore[assignment]
    +    max_total_bytes: Annotated[
    +        PositiveInt,
    +        Field(
    +            description="Maximum cumulative size in bytes of all data extracted from the archive during processing"
    +        ),
    +    ] = 300 * 1024 * 1024
    +    max_file_bytes: Annotated[
    +        PositiveInt,
    +        Field(
    +            description="Maximum size in bytes for any single file extracted from the archive"
    +        ),
    +    ] = 10 * 1024 * 1024
    +    max_member_count: Annotated[
    +        PositiveInt, Field(description="Maximum number of archive members to process")
    +    ] = 1000
    +
    +
     class MsExcelBackendOptions(BaseBackendOptions):
         """Options specific to the MS Excel backend."""
     
    @@ -194,6 +215,7 @@ class XBRLBackendOptions(BaseBackendOptions):
             HTMLBackendOptions,
             MarkdownBackendOptions,
             PdfBackendOptions,
    +        MetsGbsBackendOptions,
             MsExcelBackendOptions,
             LatexBackendOptions,
             XBRLBackendOptions,
    
  • docling/datamodel/document.py+39 13 modified
    @@ -61,7 +61,7 @@
         DeclarativeDocumentBackend,
         PaginatedDocumentBackend,
     )
    -from docling.datamodel.backend_options import BackendOptions
    +from docling.datamodel.backend_options import BackendOptions, MetsGbsBackendOptions
     from docling.datamodel.base_models import (
         AssembledUnit,
         ConfidenceReport,
    @@ -750,19 +750,45 @@ def _detect_csv(
         def _detect_mets_gbs(
             obj: Union[Path, DocumentStream],
         ) -> Optional[Literal["application/mets+xml"]]:
    +        # Use default limits for safe format detection
    +        default_options = MetsGbsBackendOptions()
    +        max_file_bytes = default_options.max_file_bytes
    +        max_member_count = default_options.max_member_count
    +
             content = obj if isinstance(obj, Path) else obj.stream
             tar: tarfile.TarFile
             member: tarfile.TarInfo
    -        with tarfile.open(
    -            name=content if isinstance(content, Path) else None,
    -            fileobj=content if isinstance(content, BytesIO) else None,
    -            mode="r:gz",
    -        ) as tar:
    -            for member in tar.getmembers():
    -                if member.name.endswith(".xml"):
    -                    file = tar.extractfile(member)
    -                    if file is not None:
    -                        content_str = file.read().decode(errors="ignore")
    -                        if "http://www.loc.gov/METS/" in content_str:
    -                            return "application/mets+xml"
    +        member_count = 0
    +
    +        try:
    +            with tarfile.open(
    +                name=content if isinstance(content, Path) else None,
    +                fileobj=content if isinstance(content, BytesIO) else None,
    +                mode="r:gz",
    +            ) as tar:
    +                for member in tar.getmembers():
    +                    member_count += 1
    +                    if member_count > max_member_count:
    +                        _log.warning(
    +                            f"Archive exceeds member count limit ({max_member_count}) during format detection"
    +                        )
    +                        return None
    +
    +                    if member.name.endswith(".xml"):
    +                        file = tar.extractfile(member)
    +                        if file is not None:
    +                            xml_content = file.read(max_file_bytes + 1)
    +                            if len(xml_content) > max_file_bytes:
    +                                _log.warning(
    +                                    f"XML file {member.name} exceeds size limit ({max_file_bytes} bytes) during format detection"
    +                                )
    +                                continue
    +
    +                            content_str = xml_content.decode(errors="ignore")
    +                            if "http://www.loc.gov/METS/" in content_str:
    +                                return "application/mets+xml"
    +        except Exception as e:
    +            _log.warning(f"Error during METS-GBS format detection: {e}")
    +            return None
    +
             return None
    
  • docling/document_converter.py+7 0 modified
    @@ -40,6 +40,7 @@
         HTMLBackendOptions,
         LatexBackendOptions,
         MarkdownBackendOptions,
    +    MetsGbsBackendOptions,
         PdfBackendOptions,
         XBRLBackendOptions,
     )
    @@ -150,6 +151,12 @@ class PdfFormatOption(FormatOption):
         backend_options: Optional[PdfBackendOptions] = None
     
     
    +class MetsGbsFormatOption(FormatOption):
    +    pipeline_cls: Type = StandardPdfPipeline
    +    backend: Type[AbstractDocumentBackend] = MetsGbsDocumentBackend
    +    backend_options: MetsGbsBackendOptions | None = None
    +
    +
     class AudioFormatOption(FormatOption):
         pipeline_cls: Type = AsrPipeline
         backend: Type[AbstractDocumentBackend] = NoOpBackend
    
  • tests/test_backend_mets_gbs.py+110 0 modified
    @@ -3,6 +3,7 @@
     import pytest
     
     from docling.backend.mets_gbs_backend import MetsGbsDocumentBackend, MetsGbsPageBackend
    +from docling.datamodel.backend_options import MetsGbsBackendOptions
     from docling.datamodel.base_models import BoundingBox, InputFormat
     from docling.datamodel.document import InputDocument
     
    @@ -75,3 +76,112 @@ def test_num_pages(test_doc_path):
     
         # Explicitly clean up resources to prevent race conditions in CI
         doc_backend.unload()
    +
    +
    +def test_max_file_bytes_limit(test_doc_path):
    +    """Test that max_file_bytes limit is enforced during extraction."""
    +
    +    options = MetsGbsBackendOptions(max_file_bytes=100)
    +
    +    with pytest.raises(ValueError, match=r"exceeds.*size limit"):
    +        InputDocument(
    +            path_or_stream=test_doc_path,
    +            format=InputFormat.METS_GBS,
    +            backend=MetsGbsDocumentBackend,
    +            backend_options=options,
    +        )
    +
    +
    +def test_max_total_bytes_limit(test_doc_path):
    +    """Test that max_total_bytes limit is enforced across all extractions."""
    +
    +    options = MetsGbsBackendOptions(
    +        max_file_bytes=10 * 1024 * 1024,
    +        max_total_bytes=1000,
    +    )
    +
    +    with pytest.raises(ValueError, match="exceeds maximum total extraction size"):
    +        InputDocument(
    +            path_or_stream=test_doc_path,
    +            format=InputFormat.METS_GBS,
    +            backend=MetsGbsDocumentBackend,
    +            backend_options=options,
    +        )
    +
    +
    +def test_max_member_count_limit(test_doc_path):
    +    """Test that max_member_count limit is enforced during extraction."""
    +
    +    options = MetsGbsBackendOptions(max_member_count=2)
    +
    +    with pytest.raises(ValueError, match="exceeds maximum member count limit"):
    +        InputDocument(
    +            path_or_stream=test_doc_path,
    +            format=InputFormat.METS_GBS,
    +            backend=MetsGbsDocumentBackend,
    +            backend_options=options,
    +        )
    +
    +
    +def test_limits_with_valid_values(test_doc_path):
    +    """Test that processing succeeds with generous limits."""
    +    options = MetsGbsBackendOptions(
    +        max_file_bytes=10 * 1024 * 1024,  # 10 MB
    +        max_total_bytes=300 * 1024 * 1024,  # 300 MB
    +        max_member_count=1000,
    +    )
    +
    +    in_doc = InputDocument(
    +        path_or_stream=test_doc_path,
    +        format=InputFormat.METS_GBS,
    +        backend=MetsGbsDocumentBackend,
    +        backend_options=options,
    +    )
    +
    +    assert in_doc.valid
    +    doc_backend: MetsGbsDocumentBackend = in_doc._backend
    +    assert doc_backend.is_valid()
    +    assert doc_backend.page_count() == 3
    +
    +    page_backend: MetsGbsPageBackend = doc_backend.load_page(0)
    +    assert page_backend.is_valid()
    +
    +    page_backend.unload()
    +    doc_backend.unload()
    +
    +
    +def test_total_bytes_tracking_across_pages(test_doc_path):
    +    """Test that total bytes are tracked cumulatively across initialization and page loading.
    +
    +    This test ensures that when max_total_bytes is larger than max_file_bytes,
    +    initialization succeeds but page loading eventually fails due to cumulative limit.
    +    """
    +    options = MetsGbsBackendOptions(
    +        max_file_bytes=10 * 1024 * 1024,
    +        max_total_bytes=20 * 1024,
    +        max_member_count=1000,
    +    )
    +
    +    in_doc = InputDocument(
    +        path_or_stream=test_doc_path,
    +        format=InputFormat.METS_GBS,
    +        backend=MetsGbsDocumentBackend,
    +        backend_options=options,
    +    )
    +
    +    assert in_doc.valid
    +    doc_backend: MetsGbsDocumentBackend = in_doc._backend
    +    assert doc_backend.is_valid()
    +
    +    page_load_failed = False
    +    for page_index in range(doc_backend.page_count()):
    +        try:
    +            page_backend: MetsGbsPageBackend = doc_backend.load_page(page_index)
    +            page_backend.unload()
    +        except ValueError as e:
    +            assert "Total extracted data exceeds maximum limit" in str(e)
    +            page_load_failed = True
    +            break
    +
    +    assert page_load_failed, "Expected page loading to fail due to total bytes limit"
    +    doc_backend.unload()
    
  • tests/test_input_doc.py+10 0 modified
    @@ -248,6 +248,16 @@ def test_guess_format(tmp_path):
         # Plain .txt file (not USPTO) should be detected as Markdown
         stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
         assert dci._guess_format(stream) == InputFormat.MD
    +
    +    # Valid METS-GBS archive
    +    mets_gbs_path = Path("./tests/data/mets_gbs/32044009881525_select.tar.gz")
    +    if mets_gbs_path.exists():
    +        assert dci._guess_format(mets_gbs_path) == InputFormat.METS_GBS
    +
    +        buf = BytesIO(mets_gbs_path.open("rb").read())
    +        stream = DocumentStream(name="32044009881525_select.tar.gz", stream=buf)
    +        assert dci._guess_format(stream) == InputFormat.METS_GBS
    +
         doc_path = temp_dir / "pftaps_wrong.txt"
         doc_path.write_text("xyz", encoding="utf-8")
         assert dci._guess_format(doc_path) == InputFormat.MD
    

Vulnerability mechanics

No source-code context for this CVE — mechanics is only generated when we can read the actual fix diff. Without that, the four sections (root cause, attack vector, affected code, fix) would be speculation rather than analysis.

References

3

News mentions

1