HTML Cleaner allows crafted and SVG embedded scripts to pass through
Description
lxml is a library for processing XML and HTML in the Python language. Prior to version 4.6.5, the HTML Cleaner in lxml.html lets certain crafted script content pass through, as well as script content in SVG files embedded using data URIs. Users who employ the HTML Cleaner in a security-relevant context should upgrade to lxml 4.6.5 to receive a patch. There are no known workarounds available.
Affected packages
Versions sourced from the GitHub Security Advisory.
| Package | Affected versions | Patched versions |
|---|---|---|
| lxml (PyPI) | < 4.6.5 | 4.6.5 |
Affected products
1
Patches
3
3 files changed · +9 −5
CHANGES.txt+1 −1 modified@@ -2,7 +2,7 @@ lxml changelog ============== -4.6.5 (2021-12-??) +4.6.5 (2021-12-12) ================== Bugs fixed
doc/main.txt+7 −3 modified@@ -159,8 +159,8 @@ Index <http://pypi.python.org/pypi/lxml/>`_ (PyPI). It has the source that compiles on various platforms. The source distribution is signed with `this key <pubkey.asc>`_. -The latest version is `lxml 4.6.4`_, released 2021-11-01 -(`changes for 4.6.4`_). `Older versions <#old-versions>`_ +The latest version is `lxml 4.6.5`_, released 2021-12-12 +(`changes for 4.6.5`_). `Older versions <#old-versions>`_ are listed below. Please take a look at the @@ -256,7 +256,9 @@ See the websites of lxml .. and the `latest in-development version <http://lxml.de/dev/>`_. -.. _`PDF documentation`: lxmldoc-4.6.4.pdf +.. _`PDF documentation`: lxmldoc-4.6.5.pdf + +* `lxml 4.6.5`_, released 2021-12-12 (`changes for 4.6.5`_) * `lxml 4.6.4`_, released 2021-11-01 (`changes for 4.6.4`_) @@ -284,6 +286,7 @@ See the websites of lxml * `older releases <http://lxml.de/4.3/#old-versions>`_ +.. _`lxml 4.6.5`: /files/lxml-4.6.5.tgz .. _`lxml 4.6.4`: /files/lxml-4.6.4.tgz .. _`lxml 4.6.3`: /files/lxml-4.6.3.tgz .. _`lxml 4.6.2`: /files/lxml-4.6.2.tgz @@ -297,6 +300,7 @@ See the websites of lxml .. _`lxml 4.4.1`: /files/lxml-4.4.1.tgz .. _`lxml 4.4.0`: /files/lxml-4.4.0.tgz +.. _`changes for 4.6.5`: /changes-4.6.5.html .. _`changes for 4.6.4`: /changes-4.6.4.html .. _`changes for 4.6.3`: /changes-4.6.3.html .. _`changes for 4.6.2`: /changes-4.6.2.html
src/lxml/__init__.py+1 −1 modified@@ -1,6 +1,6 @@ # this is a package -__version__ = "4.6.4" +__version__ = "4.6.5" def get_include():
f2330237440d Cleaner: Remove SVG image data URLs since they can embed script content.
2 files changed · +60 −8
src/lxml/html/clean.py+15 −8 modified@@ -75,18 +75,25 @@ # All kinds of schemes besides just javascript: that can cause # execution: -_is_image_dataurl = re.compile( - r'^data:image/.+;base64', re.I).search +_find_image_dataurls = re.compile( + r'^data:image/(.+);base64,', re.I).findall _is_possibly_malicious_scheme = re.compile( - r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):', - re.I).search + r'(javascript|jscript|livescript|vbscript|data|about|mocha):', + re.I).findall +# SVG images can contain script content +_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).findall + def _is_javascript_scheme(s): - if _is_image_dataurl(s): - return None - return _is_possibly_malicious_scheme(s) + is_image_url = False + for image_type in _find_image_dataurls(s): + is_image_url = True + if _is_unsafe_image_type(image_type): + return True + if is_image_url: + return False + return bool(_is_possibly_malicious_scheme(s)) _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub -# FIXME: should data: be blocked? # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx _conditional_comment_re = re.compile(
src/lxml/html/tests/test_clean.py+45 −0 modified@@ -1,3 +1,5 @@ +import base64 +import gzip import unittest from lxml.tests.common_imports import make_doctest @@ -143,6 +145,49 @@ def test_sneaky_import_in_style(self): cleaned, "%s -> %s" % (style_code, cleaned)) + def test_svg_data_links(self): + # Remove SVG images with potentially insecure content. + svg = b'<svg onload="alert(123)" />' + svgz = gzip.compress(svg) + svg_b64 = base64.b64encode(svg).decode('ASCII') + svgz_b64 = base64.b64encode(svgz).decode('ASCII') + urls = [ + "data:image/svg+xml;base64," + svg_b64, + "data:image/svg+xml-compressed;base64," + svgz_b64, + ] + for url in urls: + html = '<img src="%s">' % url + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'<img src="">', + cleaned, + "%s -> %s" % (url, cleaned)) + + def test_image_data_links(self): + data = b'123' + data_b64 = base64.b64encode(data).decode('ASCII') + urls = [ + "data:image/jpeg;base64," + data_b64, + "data:image/apng;base64," + data_b64, + "data:image/png;base64," + data_b64, + "data:image/gif;base64," + data_b64, + "data:image/webp;base64," + data_b64, + "data:image/bmp;base64," + data_b64, + "data:image/tiff;base64," + data_b64, + "data:image/x-icon;base64," + data_b64, + ] + for url in urls: + html = '<img src="%s">' % url + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + html.encode("UTF-8"), + cleaned, + "%s -> %s" % (url, cleaned)) + def test_formaction_attribute_in_button_input(self): # The formaction attribute overrides the form's action and should be # treated as a malicious link attribute
12fa96690071 Cleaner: Prevent "@import" from re-occurring in the CSS after replacements, e.g. "@@importimport".
2 files changed · +22 −0
src/lxml/html/clean.py+2 −0 modified@@ -541,6 +541,8 @@ def _has_sneaky_javascript(self, style): return True if 'expression(' in style: return True + if '@import' in style: + return True if '</noscript' in style: # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">' return True
src/lxml/html/tests/test_clean.py+20 −0 modified@@ -123,6 +123,26 @@ def test_sneaky_js_in_math_style(self): b'<math><style>/* deleted */</style></math>', lxml.html.tostring(clean_html(s))) + def test_sneaky_import_in_style(self): + # Prevent "@@importimport" -> "@import" replacement. + style_codes = [ + "@@importimport(extstyle.css)", + "@ @ import import(extstyle.css)", + "@ @ importimport(extstyle.css)", + "@@ import import(extstyle.css)", + "@ @import import(extstyle.css)", + "@@importimport()", + ] + for style_code in style_codes: + html = '<style>%s</style>' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'<style>/* deleted */</style>', + cleaned, + "%s -> %s" % (style_code, cleaned)) + def test_formaction_attribute_in_button_input(self): # The formaction attribute overrides the form's action and should be # treated as a malicious link attribute
Vulnerability mechanics
Generated by null/stub on May 9, 2026. Inputs: CWE entries + fix-commit diffs from this CVE's patches. Citations validated against bundle.
References
22
- github.com/advisories/GHSA-55x5-fj6c-h6m8 ghsa ADVISORY
- lists.fedoraproject.org/archives/list/package-announce%40lists.fedoraproject.org/message/TUIS2KE3HZ2AAQKXFLTJFZPP2IFHJTC7/ mitre vendor-advisory x_refsource_FEDORA
- lists.fedoraproject.org/archives/list/package-announce%40lists.fedoraproject.org/message/V2XMOM5PFT6U5AAXY6EFNT5JZCKKHK2V/ mitre vendor-advisory x_refsource_FEDORA
- lists.fedoraproject.org/archives/list/package-announce%40lists.fedoraproject.org/message/WZGNET2A4WGLSUXLBFYKNC5PXHQMI3I7/ mitre vendor-advisory x_refsource_FEDORA
- lists.fedoraproject.org/archives/list/package-announce%40lists.fedoraproject.org/message/ZQ4SPKJX3RRJK4UWA6FXCRHD2TVRQI44/ mitre vendor-advisory x_refsource_FEDORA
- nvd.nist.gov/vuln/detail/CVE-2021-43818 ghsa ADVISORY
- security.gentoo.org/glsa/202208-06 ghsa vendor-advisory x_refsource_GENTOO WEB
- www.debian.org/security/2022/dsa-5043 ghsa vendor-advisory x_refsource_DEBIAN WEB
- github.com/lxml/lxml/commit/12fa9669007180a7bb87d990c375cf91ca5b664a ghsa x_refsource_MISC WEB
- github.com/lxml/lxml/commit/a3eacbc0dcf1de1c822ec29fb7d090a4b1712a9c ghsa x_refsource_MISC WEB
- github.com/lxml/lxml/commit/f2330237440df7e8f39c3ad1b1aa8852be3b27c0 ghsa x_refsource_MISC WEB
- github.com/lxml/lxml/security/advisories/GHSA-55x5-fj6c-h6m8 ghsa x_refsource_CONFIRM WEB
- github.com/pypa/advisory-database/tree/main/vulns/lxml/PYSEC-2021-852.yaml ghsa WEB
- lists.debian.org/debian-lts-announce/2021/12/msg00037.html ghsa mailing-list x_refsource_MLIST WEB
- lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/TUIS2KE3HZ2AAQKXFLTJFZPP2IFHJTC7 ghsa WEB
- lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/V2XMOM5PFT6U5AAXY6EFNT5JZCKKHK2V ghsa WEB
- lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/WZGNET2A4WGLSUXLBFYKNC5PXHQMI3I7 ghsa WEB
- lists.fedoraproject.org/archives/list/package-announce@lists.fedoraproject.org/message/ZQ4SPKJX3RRJK4UWA6FXCRHD2TVRQI44 ghsa WEB
- security.netapp.com/advisory/ntap-20220107-0005 ghsa WEB
- security.netapp.com/advisory/ntap-20220107-0005/ mitre x_refsource_CONFIRM
- www.oracle.com/security-alerts/cpuapr2022.html ghsa x_refsource_MISC WEB
- www.oracle.com/security-alerts/cpujul2022.html ghsa x_refsource_MISC WEB
News mentions
0
No linked articles in our index yet.