VYPR
High severity · 7.5 · GHSA Advisory · Published Oct 6, 2025 · Updated Apr 15, 2026

CVE-2025-6985

CVE-2025-6985

Description

The HTMLSectionSplitter class in langchain-text-splitters version 0.3.8 is vulnerable to XML External Entity (XXE) attacks due to unsafe XSLT parsing. This vulnerability arises because the class allows the use of arbitrary XSLT stylesheets, which are parsed using lxml.etree.parse() and lxml.etree.XSLT() without any hardening measures. In lxml versions up to 4.9.x, external entities are resolved by default, allowing attackers to read arbitrary local files or perform outbound HTTP(S) fetches. In lxml versions 5.0 and above, while entity expansion is disabled, the XSLT document() function can still read any URI unless XSLTAccessControl is applied. This vulnerability allows remote attackers to gain read-only access to any file the LangChain process can reach, including sensitive files such as SSH keys, environment files, source code, or cloud metadata. No authentication, special privileges, or user interaction are required, and the issue is exploitable in default deployments that enable custom XSLT.

Affected packages

Versions sourced from the GitHub Security Advisory.

| Package | Affected versions | Patched versions |
| --- | --- | --- |
| langchain-text-splitters (PyPI) | < 0.3.9 | 0.3.9 |

Affected products

1

Patches

1
43eef435505a

security: Remove xslt_path and harden XML parsers in HTMLSectionSplitter: package: langchain-text-splitters (#31819)

https://github.com/langchain-ai/langchain · Cole Murray · Jul 2, 2025 · via ghsa
3 files changed · +146 -47
  • libs/text-splitters/langchain_text_splitters/html.py · +16 -15 · modified
    @@ -309,7 +309,6 @@ class HTMLSectionSplitter:
         def __init__(
             self,
             headers_to_split_on: List[Tuple[str, str]],
    -        xslt_path: Optional[str] = None,
             **kwargs: Any,
         ) -> None:
             """Create a new HTMLSectionSplitter.
    @@ -318,20 +317,13 @@ def __init__(
                 headers_to_split_on: list of tuples of headers we want to track mapped to
                     (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                     h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2"].
    -            xslt_path: path to xslt file for document transformation.
    -            Uses a default if not passed.
    -            Needed for html contents that using different format and layouts.
                 **kwargs (Any): Additional optional arguments for customizations.
     
             """
             self.headers_to_split_on = dict(headers_to_split_on)
    -
    -        if xslt_path is None:
    -            self.xslt_path = (
    -                pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"
    -            ).absolute()
    -        else:
    -            self.xslt_path = pathlib.Path(xslt_path).absolute()
    +        self.xslt_path = (
    +            pathlib.Path(__file__).parent / "xsl/converting_to_header.xslt"
    +        ).absolute()
             self.kwargs = kwargs
     
         def split_documents(self, documents: Iterable[Document]) -> List[Document]:
    @@ -457,11 +449,20 @@ def convert_possible_tags_to_header(self, html_content: str) -> str:
                     "Unable to import lxml, please install with `pip install lxml`."
                 ) from e
             # use lxml library to parse html document and return xml ElementTree
    -        parser = etree.HTMLParser()
    -        tree = etree.parse(StringIO(html_content), parser)
    +        # Create secure parsers to prevent XXE attacks
    +        html_parser = etree.HTMLParser(no_network=True)
    +        xslt_parser = etree.XMLParser(
    +            resolve_entities=False, no_network=True, load_dtd=False
    +        )
    +
    +        # Apply XSLT access control to prevent file/network access
    +        # DENY_ALL is a predefined access control that blocks all file/network access
    +        # Type ignore needed due to incomplete lxml type stubs
    +        ac = etree.XSLTAccessControl.DENY_ALL  # type: ignore[attr-defined]
     
    -        xslt_tree = etree.parse(self.xslt_path)
    -        transform = etree.XSLT(xslt_tree)
    +        tree = etree.parse(StringIO(html_content), html_parser)
    +        xslt_tree = etree.parse(self.xslt_path, xslt_parser)
    +        transform = etree.XSLT(xslt_tree, access_control=ac)
             result = transform(tree)
             return str(result)
     
    
  • libs/text-splitters/tests/unit_tests/test_html_security.py · +130 -0 · added
    @@ -0,0 +1,130 @@
    +"""Security tests for HTML splitters to prevent XXE attacks."""
    +
    +import pytest
    +
    +from langchain_text_splitters.html import HTMLSectionSplitter
    +
    +
    +@pytest.mark.requires("lxml", "bs4")
    +class TestHTMLSectionSplitterSecurity:
    +    """Security tests for HTMLSectionSplitter to ensure XXE prevention."""
    +
    +    def test_xxe_entity_attack_blocked(self) -> None:
    +        """Test that external entity attacks are blocked."""
    +        # Create HTML content to process
    +        html_content = """<html><body><p>Test content</p></body></html>"""
    +
    +        # Since xslt_path parameter is removed, this attack vector is eliminated
    +        # The splitter should use only the default XSLT
    +        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
    +
    +        # Process the HTML - should not contain any external entity content
    +        result = splitter.split_text(html_content)
    +
    +        # Verify that no external entity content is present
    +        all_content = " ".join([doc.page_content for doc in result])
    +        assert "root:" not in all_content  # /etc/passwd content
    +        assert "XXE Attack Result" not in all_content
    +
    +    def test_xxe_document_function_blocked(self) -> None:
    +        """Test that XSLT document() function attacks are blocked."""
    +        # Even if someone modifies the default XSLT internally,
    +        # the secure parser configuration should block document() attacks
    +
    +        html_content = (
    +            """<html><body><h1>Test Header</h1><p>Test content</p></body></html>"""
    +        )
    +
    +        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
    +
    +        # Process the HTML safely
    +        result = splitter.split_text(html_content)
    +
    +        # Should process normally without any security issues
    +        assert len(result) > 0
    +        assert any("Test content" in doc.page_content for doc in result)
    +
    +    def test_secure_parser_configuration(self) -> None:
    +        """Test that parsers are configured with security settings."""
    +        # This test verifies our security hardening is in place
    +        html_content = """<html><body><h1>Test</h1></body></html>"""
    +
    +        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
    +
    +        # The convert_possible_tags_to_header method should use secure parsers
    +        result = splitter.convert_possible_tags_to_header(html_content)
    +
    +        # Result should be valid transformed HTML
    +        assert result is not None
    +        assert isinstance(result, str)
    +
    +    def test_no_network_access(self) -> None:
    +        """Test that network access is blocked in parsers."""
    +        # Create HTML that might trigger network access
    +        html_with_external_ref = """<?xml version="1.0"?>
    +<!DOCTYPE html [
    +  <!ENTITY external SYSTEM "http://attacker.com/xxe">
    +]>
    +<html>
    +  <body>
    +    <h1>Test</h1>
    +    <p>&external;</p>
    +  </body>
    +</html>"""
    +
    +        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
    +
    +        # Process the HTML - should not make network requests
    +        result = splitter.split_text(html_with_external_ref)
    +
    +        # Verify no external content is included
    +        all_content = " ".join([doc.page_content for doc in result])
    +        assert "attacker.com" not in all_content
    +
    +    def test_dtd_processing_disabled(self) -> None:
    +        """Test that DTD processing is disabled."""
    +        # HTML with DTD that attempts to define entities
    +        html_with_dtd = """<!DOCTYPE html [
    +  <!ELEMENT html (body)>
    +  <!ELEMENT body (h1, p)>
    +  <!ELEMENT h1 (#PCDATA)>
    +  <!ELEMENT p (#PCDATA)>
    +  <!ENTITY test "This is a test entity">
    +]>
    +<html>
    +  <body>
    +    <h1>Header</h1>
    +    <p>&test;</p>
    +  </body>
    +</html>"""
    +
    +        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
    +
    +        # Process the HTML - entities should not be resolved
    +        result = splitter.split_text(html_with_dtd)
    +
    +        # The entity should not be expanded
    +        all_content = " ".join([doc.page_content for doc in result])
    +        assert "This is a test entity" not in all_content
    +
    +    def test_safe_default_xslt_usage(self) -> None:
    +        """Test that the default XSLT file is used safely."""
    +        # Test with HTML that has font-size styling (what the default XSLT handles)
    +        html_with_font_size = """<html>
    +<body>
    +    <span style="font-size: 24px;">Large Header</span>
    +    <p>Content under large text</p>
    +    <span style="font-size: 18px;">Small Header</span>
    +    <p>Content under small text</p>
    +</body>
    +</html>"""
    +
    +        splitter = HTMLSectionSplitter(headers_to_split_on=[("h1", "Header 1")])
    +
    +        # Process the HTML using the default XSLT
    +        result = splitter.split_text(html_with_font_size)
    +
    +        # Should successfully process the content
    +        assert len(result) > 0
    +        # Large font text should be converted to header
    +        assert any("Large Header" in str(doc.metadata.values()) for doc in result)
    
  • libs/text-splitters/tests/unit_tests/test_text_splitters.py · +0 -32 · modified
    @@ -3,7 +3,6 @@
     import random
     import re
     import string
    -from pathlib import Path
     from typing import Any, Callable, List, Tuple
     
     import pytest
    @@ -2865,37 +2864,6 @@ def test_happy_path_splitting_based_on_header_with_whitespace_chars() -> None:
         assert docs[2].metadata["Header 2"] == "Baz"
     
     
    -@pytest.mark.requires("bs4")
    -@pytest.mark.requires("lxml")
    -def test_section_splitter_accepts_a_relative_path() -> None:
    -    html_string = """<html><body><p>Foo</p></body></html>"""
    -    test_file = Path("tests/test_data/test_splitter.xslt")
    -    assert test_file.is_file()
    -
    -    sec_splitter = HTMLSectionSplitter(
    -        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
    -        xslt_path=test_file.as_posix(),
    -    )
    -
    -    sec_splitter.split_text(html_string)
    -
    -
    -@pytest.mark.requires("bs4")
    -@pytest.mark.requires("lxml")
    -def test_section_splitter_accepts_an_absolute_path() -> None:
    -    html_string = """<html><body><p>Foo</p></body></html>"""
    -    test_file = Path("tests/test_data/test_splitter.xslt").absolute()
    -    assert test_file.is_absolute()
    -    assert test_file.is_file()
    -
    -    sec_splitter = HTMLSectionSplitter(
    -        headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")],
    -        xslt_path=test_file.as_posix(),
    -    )
    -
    -    sec_splitter.split_text(html_string)
    -
    -
     @pytest.mark.requires("bs4")
     @pytest.mark.requires("lxml")
     def test_happy_path_splitting_with_duplicate_header_tag() -> None:
    

Vulnerability mechanics

Generated by null/stub on May 9, 2026. Inputs: CWE entries + fix-commit diffs from this CVE's patches. Citations validated against bundle.

References

5

News mentions

0

No linked articles in our index yet.