VYPR
Medium severity 4.3 · NVD Advisory · Published Jun 17, 2025 · Updated Apr 15, 2026

CVE-2025-6069

CVE-2025-6069

Description

The html.parser.HTMLParser class had worst-case quadratic complexity when processing certain crafted malformed inputs, potentially leading to amplified denial-of-service.

Patches

7
ab0893fd5c57

[3.12] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135483)

https://github.com/python/cpython · Serhiy Storchaka · Jul 3, 2025 · via osv
3 files changed · +116 −23
  • Lib/html/parser.py · +30 −11 · modified
    @@ -25,6 +25,7 @@
     charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
     
     starttagopen = re.compile('<[a-zA-Z]')
    +endtagopen = re.compile('</[a-zA-Z]')
     piclose = re.compile('>')
     commentclose = re.compile(r'--\s*>')
     # Note:
    @@ -177,25 +178,43 @@ def goahead(self, end):
                         k = self.parse_pi(i)
                     elif startswith("<!", i):
                         k = self.parse_html_declaration(i)
    -                elif (i + 1) < n:
    +                elif (i + 1) < n or end:
                         self.handle_data("<")
                         k = i + 1
                     else:
                         break
                     if k < 0:
                         if not end:
                             break
    -                    k = rawdata.find('>', i + 1)
    -                    if k < 0:
    -                        k = rawdata.find('<', i + 1)
    -                        if k < 0:
    -                            k = i + 1
    -                    else:
    -                        k += 1
    -                    if self.convert_charrefs and not self.cdata_elem:
    -                        self.handle_data(unescape(rawdata[i:k]))
    +                    if starttagopen.match(rawdata, i):  # < + letter
    +                        pass
    +                    elif startswith("</", i):
    +                        if i + 2 == n:
    +                            self.handle_data("</")
    +                        elif endtagopen.match(rawdata, i):  # </ + letter
    +                            pass
    +                        else:
    +                            # bogus comment
    +                            self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<!--", i):
    +                        j = n
    +                        for suffix in ("--!", "--", "-"):
    +                            if rawdata.endswith(suffix, i+4):
    +                                j -= len(suffix)
    +                                break
    +                        self.handle_comment(rawdata[i+4:j])
    +                    elif startswith("<![CDATA[", i):
    +                        self.unknown_decl(rawdata[i+3:])
    +                    elif rawdata[i:i+9].lower() == '<!doctype':
    +                        self.handle_decl(rawdata[i+2:])
    +                    elif startswith("<!", i):
    +                        # bogus comment
    +                        self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<?", i):
    +                        self.handle_pi(rawdata[i+2:])
                         else:
    -                        self.handle_data(rawdata[i:k])
    +                        raise AssertionError("we should not get here!")
    +                    k = n
                     i = self.updatepos(i, k)
                 elif startswith("&#", i):
                     match = charref.match(rawdata, i)
    
  • Lib/test/test_htmlparser.py · +82 −12 · modified
    @@ -5,6 +5,7 @@
     import unittest
     
     from unittest.mock import patch
    +from test import support
     
     
     class EventCollector(html.parser.HTMLParser):
    @@ -393,28 +394,34 @@ def test_tolerant_parsing(self):
                                 ('data', '<'),
                                 ('starttag', 'bc<', [('a', None)]),
                                 ('endtag', 'html'),
    -                            ('data', '\n<img src="URL>'),
    -                            ('comment', '/img'),
    -                            ('endtag', 'html<')])
    +                            ('data', '\n')])
     
         def test_starttag_junk_chars(self):
    +        self._run_check("<", [('data', '<')])
    +        self._run_check("<>", [('data', '<>')])
    +        self._run_check("< >", [('data', '< >')])
    +        self._run_check("< ", [('data', '< ')])
             self._run_check("</>", [])
    +        self._run_check("<$>", [('data', '<$>')])
             self._run_check("</$>", [('comment', '$')])
             self._run_check("</", [('data', '</')])
    -        self._run_check("</a", [('data', '</a')])
    +        self._run_check("</a", [])
    +        self._run_check("</ a>", [('endtag', 'a')])
    +        self._run_check("</ a", [('comment', ' a')])
             self._run_check("<a<a>", [('starttag', 'a<a', [])])
             self._run_check("</a<a>", [('endtag', 'a<a')])
    -        self._run_check("<!", [('data', '<!')])
    -        self._run_check("<a", [('data', '<a')])
    -        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
    -        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
    -        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
    -        self._run_check("<a foo='>", [('data', "<a foo='>")])
    +        self._run_check("<!", [('comment', '')])
    +        self._run_check("<a", [])
    +        self._run_check("<a foo='bar'", [])
    +        self._run_check("<a foo='bar", [])
    +        self._run_check("<a foo='>'", [])
    +        self._run_check("<a foo='>", [])
             self._run_check("<a$>", [('starttag', 'a$', [])])
             self._run_check("<a$b>", [('starttag', 'a$b', [])])
             self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
             self._run_check("<a$b  >", [('starttag', 'a$b', [])])
             self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
    +        self._run_check("</a$b>", [('endtag', 'a$b')])
     
         def test_slashes_in_starttag(self):
             self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
    @@ -539,13 +546,56 @@ def test_EOF_in_charref(self):
             for html, expected in data:
                 self._run_check(html, expected)
     
    -    def test_broken_comments(self):
    -        html = ('<! not really a comment >'
    +    def test_eof_in_comments(self):
    +        data = [
    +            ('<!--', [('comment', '')]),
    +            ('<!---', [('comment', '')]),
    +            ('<!----', [('comment', '')]),
    +            ('<!-----', [('comment', '-')]),
    +            ('<!------', [('comment', '--')]),
    +            ('<!----!', [('comment', '')]),
    +            ('<!---!', [('comment', '-!')]),
    +            ('<!---!>', [('comment', '-!>')]),
    +            ('<!--foo', [('comment', 'foo')]),
    +            ('<!--foo-', [('comment', 'foo')]),
    +            ('<!--foo--', [('comment', 'foo')]),
    +            ('<!--foo--!', [('comment', 'foo')]),
    +            ('<!--<!--', [('comment', '<!')]),
    +            ('<!--<!--!', [('comment', '<!')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
    +    def test_eof_in_declarations(self):
    +        data = [
    +            ('<!', [('comment', '')]),
    +            ('<!-', [('comment', '-')]),
    +            ('<![', [('comment', '[')]),
    +            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
    +            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
    +            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
    +            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
    +            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
    +            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
    +            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
    +            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
    +            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
    +            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
    +            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
    +             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
    +    def test_bogus_comments(self):
    +        html = ('<!ELEMENT br EMPTY>'
    +                '<! not really a comment >'
                     '<! not a comment either -->'
                     '<! -- close enough -->'
                     '<!><!<-- this was an empty comment>'
                     '<!!! another bogus comment !!!>')
             expected = [
    +            ('comment', 'ELEMENT br EMPTY'),
                 ('comment', ' not really a comment '),
                 ('comment', ' not a comment either --'),
                 ('comment', ' -- close enough --'),
    @@ -600,6 +650,26 @@ def test_convert_charrefs_dropped_text(self):
                  ('endtag', 'a'), ('data', ' bar & baz')]
             )
     
    +    @support.requires_resource('cpu')
    +    def test_eof_no_quadratic_complexity(self):
    +        # Each of these examples used to take about an hour.
    +        # Now they take a fraction of a second.
    +        def check(source):
    +            parser = html.parser.HTMLParser()
    +            parser.feed(source)
    +            parser.close()
    +        n = 120_000
    +        check("<a " * n)
    +        check("<a a=" * n)
    +        check("</a " * 14 * n)
    +        check("</a a=" * 11 * n)
    +        check("<!--" * 4 * n)
    +        check("<!" * 60 * n)
    +        check("<?" * 19 * n)
    +        check("</$" * 15 * n)
    +        check("<![CDATA[" * 9 * n)
    +        check("<!doctype" * 35 * n)
    +
     
     class AttributesTestCase(TestCaseBase):
     
    
  • Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst · +4 −0 · added
    @@ -0,0 +1,4 @@
    +Fix quadratic complexity in processing specially crafted input in
    +:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
    +to the HTML5 specs -- comments and declarations are automatically closed,
    +tags are ignored.
    
8d1b3dfa0913

[3.9] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135486)

https://github.com/python/cpython · Serhiy Storchaka · Jul 3, 2025 · via osv
3 files changed · +117 −23
  • Lib/html/parser.py · +30 −11 · modified
    @@ -25,6 +25,7 @@
     charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
     
     starttagopen = re.compile('<[a-zA-Z]')
    +endtagopen = re.compile('</[a-zA-Z]')
     piclose = re.compile('>')
     commentclose = re.compile(r'--\s*>')
     # Note:
    @@ -176,25 +177,43 @@ def goahead(self, end):
                         k = self.parse_pi(i)
                     elif startswith("<!", i):
                         k = self.parse_html_declaration(i)
    -                elif (i + 1) < n:
    +                elif (i + 1) < n or end:
                         self.handle_data("<")
                         k = i + 1
                     else:
                         break
                     if k < 0:
                         if not end:
                             break
    -                    k = rawdata.find('>', i + 1)
    -                    if k < 0:
    -                        k = rawdata.find('<', i + 1)
    -                        if k < 0:
    -                            k = i + 1
    -                    else:
    -                        k += 1
    -                    if self.convert_charrefs and not self.cdata_elem:
    -                        self.handle_data(unescape(rawdata[i:k]))
    +                    if starttagopen.match(rawdata, i):  # < + letter
    +                        pass
    +                    elif startswith("</", i):
    +                        if i + 2 == n:
    +                            self.handle_data("</")
    +                        elif endtagopen.match(rawdata, i):  # </ + letter
    +                            pass
    +                        else:
    +                            # bogus comment
    +                            self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<!--", i):
    +                        j = n
    +                        for suffix in ("--!", "--", "-"):
    +                            if rawdata.endswith(suffix, i+4):
    +                                j -= len(suffix)
    +                                break
    +                        self.handle_comment(rawdata[i+4:j])
    +                    elif startswith("<![CDATA[", i):
    +                        self.unknown_decl(rawdata[i+3:])
    +                    elif rawdata[i:i+9].lower() == '<!doctype':
    +                        self.handle_decl(rawdata[i+2:])
    +                    elif startswith("<!", i):
    +                        # bogus comment
    +                        self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<?", i):
    +                        self.handle_pi(rawdata[i+2:])
                         else:
    -                        self.handle_data(rawdata[i:k])
    +                        raise AssertionError("we should not get here!")
    +                    k = n
                     i = self.updatepos(i, k)
                 elif startswith("&#", i):
                     match = charref.match(rawdata, i)
    
  • Lib/test/test_htmlparser.py · +83 −12 · modified
    @@ -4,6 +4,8 @@
     import pprint
     import unittest
     
    +from test import support
    +
     
     class EventCollector(html.parser.HTMLParser):
     
    @@ -391,28 +393,34 @@ def test_tolerant_parsing(self):
                                 ('data', '<'),
                                 ('starttag', 'bc<', [('a', None)]),
                                 ('endtag', 'html'),
    -                            ('data', '\n<img src="URL>'),
    -                            ('comment', '/img'),
    -                            ('endtag', 'html<')])
    +                            ('data', '\n')])
     
         def test_starttag_junk_chars(self):
    +        self._run_check("<", [('data', '<')])
    +        self._run_check("<>", [('data', '<>')])
    +        self._run_check("< >", [('data', '< >')])
    +        self._run_check("< ", [('data', '< ')])
             self._run_check("</>", [])
    +        self._run_check("<$>", [('data', '<$>')])
             self._run_check("</$>", [('comment', '$')])
             self._run_check("</", [('data', '</')])
    -        self._run_check("</a", [('data', '</a')])
    +        self._run_check("</a", [])
    +        self._run_check("</ a>", [('endtag', 'a')])
    +        self._run_check("</ a", [('comment', ' a')])
             self._run_check("<a<a>", [('starttag', 'a<a', [])])
             self._run_check("</a<a>", [('endtag', 'a<a')])
    -        self._run_check("<!", [('data', '<!')])
    -        self._run_check("<a", [('data', '<a')])
    -        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
    -        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
    -        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
    -        self._run_check("<a foo='>", [('data', "<a foo='>")])
    +        self._run_check("<!", [('comment', '')])
    +        self._run_check("<a", [])
    +        self._run_check("<a foo='bar'", [])
    +        self._run_check("<a foo='bar", [])
    +        self._run_check("<a foo='>'", [])
    +        self._run_check("<a foo='>", [])
             self._run_check("<a$>", [('starttag', 'a$', [])])
             self._run_check("<a$b>", [('starttag', 'a$b', [])])
             self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
             self._run_check("<a$b  >", [('starttag', 'a$b', [])])
             self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
    +        self._run_check("</a$b>", [('endtag', 'a$b')])
     
         def test_slashes_in_starttag(self):
             self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
    @@ -537,13 +545,56 @@ def test_EOF_in_charref(self):
             for html, expected in data:
                 self._run_check(html, expected)
     
    -    def test_broken_comments(self):
    -        html = ('<! not really a comment >'
    +    def test_eof_in_comments(self):
    +        data = [
    +            ('<!--', [('comment', '')]),
    +            ('<!---', [('comment', '')]),
    +            ('<!----', [('comment', '')]),
    +            ('<!-----', [('comment', '-')]),
    +            ('<!------', [('comment', '--')]),
    +            ('<!----!', [('comment', '')]),
    +            ('<!---!', [('comment', '-!')]),
    +            ('<!---!>', [('comment', '-!>')]),
    +            ('<!--foo', [('comment', 'foo')]),
    +            ('<!--foo-', [('comment', 'foo')]),
    +            ('<!--foo--', [('comment', 'foo')]),
    +            ('<!--foo--!', [('comment', 'foo')]),
    +            ('<!--<!--', [('comment', '<!')]),
    +            ('<!--<!--!', [('comment', '<!')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
    +    def test_eof_in_declarations(self):
    +        data = [
    +            ('<!', [('comment', '')]),
    +            ('<!-', [('comment', '-')]),
    +            ('<![', [('comment', '[')]),
    +            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
    +            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
    +            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
    +            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
    +            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
    +            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
    +            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
    +            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
    +            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
    +            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
    +            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
    +             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
    +    def test_bogus_comments(self):
    +        html = ('<!ELEMENT br EMPTY>'
    +                '<! not really a comment >'
                     '<! not a comment either -->'
                     '<! -- close enough -->'
                     '<!><!<-- this was an empty comment>'
                     '<!!! another bogus comment !!!>')
             expected = [
    +            ('comment', 'ELEMENT br EMPTY'),
                 ('comment', ' not really a comment '),
                 ('comment', ' not a comment either --'),
                 ('comment', ' -- close enough --'),
    @@ -598,6 +649,26 @@ def test_convert_charrefs_dropped_text(self):
                  ('endtag', 'a'), ('data', ' bar & baz')]
             )
     
    +    @support.requires_resource('cpu')
    +    def test_eof_no_quadratic_complexity(self):
    +        # Each of these examples used to take about an hour.
    +        # Now they take a fraction of a second.
    +        def check(source):
    +            parser = html.parser.HTMLParser()
    +            parser.feed(source)
    +            parser.close()
    +        n = 120_000
    +        check("<a " * n)
    +        check("<a a=" * n)
    +        check("</a " * 14 * n)
    +        check("</a a=" * 11 * n)
    +        check("<!--" * 4 * n)
    +        check("<!" * 60 * n)
    +        check("<?" * 19 * n)
    +        check("</$" * 15 * n)
    +        check("<![CDATA[" * 9 * n)
    +        check("<!doctype" * 35 * n)
    +
     
     class AttributesTestCase(TestCaseBase):
     
    
  • Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst · +4 −0 · added
    @@ -0,0 +1,4 @@
    +Fix quadratic complexity in processing specially crafted input in
    +:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
    +to the HTML5 specs -- comments and declarations are automatically closed,
    +tags are ignored.
    
fdc9d214c01c

[3.10] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135485)

https://github.com/python/cpython · Serhiy Storchaka · Jul 3, 2025 · via osv
3 files changed · +117 −23
  • Lib/html/parser.py · +30 −11 · modified
    @@ -25,6 +25,7 @@
     charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
     
     starttagopen = re.compile('<[a-zA-Z]')
    +endtagopen = re.compile('</[a-zA-Z]')
     piclose = re.compile('>')
     commentclose = re.compile(r'--\s*>')
     # Note:
    @@ -176,25 +177,43 @@ def goahead(self, end):
                         k = self.parse_pi(i)
                     elif startswith("<!", i):
                         k = self.parse_html_declaration(i)
    -                elif (i + 1) < n:
    +                elif (i + 1) < n or end:
                         self.handle_data("<")
                         k = i + 1
                     else:
                         break
                     if k < 0:
                         if not end:
                             break
    -                    k = rawdata.find('>', i + 1)
    -                    if k < 0:
    -                        k = rawdata.find('<', i + 1)
    -                        if k < 0:
    -                            k = i + 1
    -                    else:
    -                        k += 1
    -                    if self.convert_charrefs and not self.cdata_elem:
    -                        self.handle_data(unescape(rawdata[i:k]))
    +                    if starttagopen.match(rawdata, i):  # < + letter
    +                        pass
    +                    elif startswith("</", i):
    +                        if i + 2 == n:
    +                            self.handle_data("</")
    +                        elif endtagopen.match(rawdata, i):  # </ + letter
    +                            pass
    +                        else:
    +                            # bogus comment
    +                            self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<!--", i):
    +                        j = n
    +                        for suffix in ("--!", "--", "-"):
    +                            if rawdata.endswith(suffix, i+4):
    +                                j -= len(suffix)
    +                                break
    +                        self.handle_comment(rawdata[i+4:j])
    +                    elif startswith("<![CDATA[", i):
    +                        self.unknown_decl(rawdata[i+3:])
    +                    elif rawdata[i:i+9].lower() == '<!doctype':
    +                        self.handle_decl(rawdata[i+2:])
    +                    elif startswith("<!", i):
    +                        # bogus comment
    +                        self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<?", i):
    +                        self.handle_pi(rawdata[i+2:])
                         else:
    -                        self.handle_data(rawdata[i:k])
    +                        raise AssertionError("we should not get here!")
    +                    k = n
                     i = self.updatepos(i, k)
                 elif startswith("&#", i):
                     match = charref.match(rawdata, i)
    
  • Lib/test/test_htmlparser.py · +83 −12 · modified
    @@ -4,6 +4,8 @@
     import pprint
     import unittest
     
    +from test import support
    +
     
     class EventCollector(html.parser.HTMLParser):
     
    @@ -391,28 +393,34 @@ def test_tolerant_parsing(self):
                                 ('data', '<'),
                                 ('starttag', 'bc<', [('a', None)]),
                                 ('endtag', 'html'),
    -                            ('data', '\n<img src="URL>'),
    -                            ('comment', '/img'),
    -                            ('endtag', 'html<')])
    +                            ('data', '\n')])
     
         def test_starttag_junk_chars(self):
    +        self._run_check("<", [('data', '<')])
    +        self._run_check("<>", [('data', '<>')])
    +        self._run_check("< >", [('data', '< >')])
    +        self._run_check("< ", [('data', '< ')])
             self._run_check("</>", [])
    +        self._run_check("<$>", [('data', '<$>')])
             self._run_check("</$>", [('comment', '$')])
             self._run_check("</", [('data', '</')])
    -        self._run_check("</a", [('data', '</a')])
    +        self._run_check("</a", [])
    +        self._run_check("</ a>", [('endtag', 'a')])
    +        self._run_check("</ a", [('comment', ' a')])
             self._run_check("<a<a>", [('starttag', 'a<a', [])])
             self._run_check("</a<a>", [('endtag', 'a<a')])
    -        self._run_check("<!", [('data', '<!')])
    -        self._run_check("<a", [('data', '<a')])
    -        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
    -        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
    -        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
    -        self._run_check("<a foo='>", [('data', "<a foo='>")])
    +        self._run_check("<!", [('comment', '')])
    +        self._run_check("<a", [])
    +        self._run_check("<a foo='bar'", [])
    +        self._run_check("<a foo='bar", [])
    +        self._run_check("<a foo='>'", [])
    +        self._run_check("<a foo='>", [])
             self._run_check("<a$>", [('starttag', 'a$', [])])
             self._run_check("<a$b>", [('starttag', 'a$b', [])])
             self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
             self._run_check("<a$b  >", [('starttag', 'a$b', [])])
             self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
    +        self._run_check("</a$b>", [('endtag', 'a$b')])
     
         def test_slashes_in_starttag(self):
             self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
    @@ -537,13 +545,56 @@ def test_EOF_in_charref(self):
             for html, expected in data:
                 self._run_check(html, expected)
     
    -    def test_broken_comments(self):
    -        html = ('<! not really a comment >'
    +    def test_eof_in_comments(self):
    +        data = [
    +            ('<!--', [('comment', '')]),
    +            ('<!---', [('comment', '')]),
    +            ('<!----', [('comment', '')]),
    +            ('<!-----', [('comment', '-')]),
    +            ('<!------', [('comment', '--')]),
    +            ('<!----!', [('comment', '')]),
    +            ('<!---!', [('comment', '-!')]),
    +            ('<!---!>', [('comment', '-!>')]),
    +            ('<!--foo', [('comment', 'foo')]),
    +            ('<!--foo-', [('comment', 'foo')]),
    +            ('<!--foo--', [('comment', 'foo')]),
    +            ('<!--foo--!', [('comment', 'foo')]),
    +            ('<!--<!--', [('comment', '<!')]),
    +            ('<!--<!--!', [('comment', '<!')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
    +    def test_eof_in_declarations(self):
    +        data = [
    +            ('<!', [('comment', '')]),
    +            ('<!-', [('comment', '-')]),
    +            ('<![', [('comment', '[')]),
    +            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
    +            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
    +            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
    +            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
    +            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
    +            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
    +            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
    +            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
    +            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
    +            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
    +            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
    +             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
    +    def test_bogus_comments(self):
    +        html = ('<!ELEMENT br EMPTY>'
    +                '<! not really a comment >'
                     '<! not a comment either -->'
                     '<! -- close enough -->'
                     '<!><!<-- this was an empty comment>'
                     '<!!! another bogus comment !!!>')
             expected = [
    +            ('comment', 'ELEMENT br EMPTY'),
                 ('comment', ' not really a comment '),
                 ('comment', ' not a comment either --'),
                 ('comment', ' -- close enough --'),
    @@ -598,6 +649,26 @@ def test_convert_charrefs_dropped_text(self):
                  ('endtag', 'a'), ('data', ' bar & baz')]
             )
     
    +    @support.requires_resource('cpu')
    +    def test_eof_no_quadratic_complexity(self):
    +        # Each of these examples used to take about an hour.
    +        # Now they take a fraction of a second.
    +        def check(source):
    +            parser = html.parser.HTMLParser()
    +            parser.feed(source)
    +            parser.close()
    +        n = 120_000
    +        check("<a " * n)
    +        check("<a a=" * n)
    +        check("</a " * 14 * n)
    +        check("</a a=" * 11 * n)
    +        check("<!--" * 4 * n)
    +        check("<!" * 60 * n)
    +        check("<?" * 19 * n)
    +        check("</$" * 15 * n)
    +        check("<![CDATA[" * 9 * n)
    +        check("<!doctype" * 35 * n)
    +
     
     class AttributesTestCase(TestCaseBase):
     
    
  • Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst · +4 −0 · added
    @@ -0,0 +1,4 @@
    +Fix quadratic complexity in processing specially crafted input in
    +:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
    +to the HTML5 specs -- comments and declarations are automatically closed,
    +tags are ignored.
    
f3c6f882cddc

[3.11] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135484)

https://github.com/python/cpython · Serhiy Storchaka · Jul 3, 2025 · via osv
3 files changed · +117 −23
  • Lib/html/parser.py · +30 −11 · modified
    @@ -25,6 +25,7 @@
     charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
     
     starttagopen = re.compile('<[a-zA-Z]')
    +endtagopen = re.compile('</[a-zA-Z]')
     piclose = re.compile('>')
     commentclose = re.compile(r'--\s*>')
     # Note:
    @@ -176,25 +177,43 @@ def goahead(self, end):
                         k = self.parse_pi(i)
                     elif startswith("<!", i):
                         k = self.parse_html_declaration(i)
    -                elif (i + 1) < n:
    +                elif (i + 1) < n or end:
                         self.handle_data("<")
                         k = i + 1
                     else:
                         break
                     if k < 0:
                         if not end:
                             break
    -                    k = rawdata.find('>', i + 1)
    -                    if k < 0:
    -                        k = rawdata.find('<', i + 1)
    -                        if k < 0:
    -                            k = i + 1
    -                    else:
    -                        k += 1
    -                    if self.convert_charrefs and not self.cdata_elem:
    -                        self.handle_data(unescape(rawdata[i:k]))
    +                    if starttagopen.match(rawdata, i):  # < + letter
    +                        pass
    +                    elif startswith("</", i):
    +                        if i + 2 == n:
    +                            self.handle_data("</")
    +                        elif endtagopen.match(rawdata, i):  # </ + letter
    +                            pass
    +                        else:
    +                            # bogus comment
    +                            self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<!--", i):
    +                        j = n
    +                        for suffix in ("--!", "--", "-"):
    +                            if rawdata.endswith(suffix, i+4):
    +                                j -= len(suffix)
    +                                break
    +                        self.handle_comment(rawdata[i+4:j])
    +                    elif startswith("<![CDATA[", i):
    +                        self.unknown_decl(rawdata[i+3:])
    +                    elif rawdata[i:i+9].lower() == '<!doctype':
    +                        self.handle_decl(rawdata[i+2:])
    +                    elif startswith("<!", i):
    +                        # bogus comment
    +                        self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<?", i):
    +                        self.handle_pi(rawdata[i+2:])
                         else:
    -                        self.handle_data(rawdata[i:k])
    +                        raise AssertionError("we should not get here!")
    +                    k = n
                     i = self.updatepos(i, k)
                 elif startswith("&#", i):
                     match = charref.match(rawdata, i)
    
  • Lib/test/test_htmlparser.py · +83 -12 modified
    @@ -4,6 +4,8 @@
     import pprint
     import unittest
     
    +from test import support
    +
     
     class EventCollector(html.parser.HTMLParser):
     
    @@ -391,28 +393,34 @@ def test_tolerant_parsing(self):
                                 ('data', '<'),
                                 ('starttag', 'bc<', [('a', None)]),
                                 ('endtag', 'html'),
    -                            ('data', '\n<img src="URL>'),
    -                            ('comment', '/img'),
    -                            ('endtag', 'html<')])
    +                            ('data', '\n')])
     
         def test_starttag_junk_chars(self):
    +        self._run_check("<", [('data', '<')])
    +        self._run_check("<>", [('data', '<>')])
    +        self._run_check("< >", [('data', '< >')])
    +        self._run_check("< ", [('data', '< ')])
             self._run_check("</>", [])
    +        self._run_check("<$>", [('data', '<$>')])
             self._run_check("</$>", [('comment', '$')])
             self._run_check("</", [('data', '</')])
    -        self._run_check("</a", [('data', '</a')])
    +        self._run_check("</a", [])
    +        self._run_check("</ a>", [('endtag', 'a')])
    +        self._run_check("</ a", [('comment', ' a')])
             self._run_check("<a<a>", [('starttag', 'a<a', [])])
             self._run_check("</a<a>", [('endtag', 'a<a')])
    -        self._run_check("<!", [('data', '<!')])
    -        self._run_check("<a", [('data', '<a')])
    -        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
    -        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
    -        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
    -        self._run_check("<a foo='>", [('data', "<a foo='>")])
    +        self._run_check("<!", [('comment', '')])
    +        self._run_check("<a", [])
    +        self._run_check("<a foo='bar'", [])
    +        self._run_check("<a foo='bar", [])
    +        self._run_check("<a foo='>'", [])
    +        self._run_check("<a foo='>", [])
             self._run_check("<a$>", [('starttag', 'a$', [])])
             self._run_check("<a$b>", [('starttag', 'a$b', [])])
             self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
             self._run_check("<a$b  >", [('starttag', 'a$b', [])])
             self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
    +        self._run_check("</a$b>", [('endtag', 'a$b')])
     
         def test_slashes_in_starttag(self):
             self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
    @@ -537,13 +545,56 @@ def test_EOF_in_charref(self):
             for html, expected in data:
                 self._run_check(html, expected)
     
    -    def test_broken_comments(self):
    -        html = ('<! not really a comment >'
    +    def test_eof_in_comments(self):
    +        data = [
    +            ('<!--', [('comment', '')]),
    +            ('<!---', [('comment', '')]),
    +            ('<!----', [('comment', '')]),
    +            ('<!-----', [('comment', '-')]),
    +            ('<!------', [('comment', '--')]),
    +            ('<!----!', [('comment', '')]),
    +            ('<!---!', [('comment', '-!')]),
    +            ('<!---!>', [('comment', '-!>')]),
    +            ('<!--foo', [('comment', 'foo')]),
    +            ('<!--foo-', [('comment', 'foo')]),
    +            ('<!--foo--', [('comment', 'foo')]),
    +            ('<!--foo--!', [('comment', 'foo')]),
    +            ('<!--<!--', [('comment', '<!')]),
    +            ('<!--<!--!', [('comment', '<!')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
    +    def test_eof_in_declarations(self):
    +        data = [
    +            ('<!', [('comment', '')]),
    +            ('<!-', [('comment', '-')]),
    +            ('<![', [('comment', '[')]),
    +            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
    +            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
    +            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
    +            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
    +            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
    +            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
    +            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
    +            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
    +            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
    +            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
    +            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
    +             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
    +    def test_bogus_comments(self):
    +        html = ('<!ELEMENT br EMPTY>'
    +                '<! not really a comment >'
                     '<! not a comment either -->'
                     '<! -- close enough -->'
                     '<!><!<-- this was an empty comment>'
                     '<!!! another bogus comment !!!>')
             expected = [
    +            ('comment', 'ELEMENT br EMPTY'),
                 ('comment', ' not really a comment '),
                 ('comment', ' not a comment either --'),
                 ('comment', ' -- close enough --'),
    @@ -598,6 +649,26 @@ def test_convert_charrefs_dropped_text(self):
                  ('endtag', 'a'), ('data', ' bar & baz')]
             )
     
    +    @support.requires_resource('cpu')
    +    def test_eof_no_quadratic_complexity(self):
    +        # Each of these examples used to take about an hour.
    +        # Now they take a fraction of a second.
    +        def check(source):
    +            parser = html.parser.HTMLParser()
    +            parser.feed(source)
    +            parser.close()
    +        n = 120_000
    +        check("<a " * n)
    +        check("<a a=" * n)
    +        check("</a " * 14 * n)
    +        check("</a a=" * 11 * n)
    +        check("<!--" * 4 * n)
    +        check("<!" * 60 * n)
    +        check("<?" * 19 * n)
    +        check("</$" * 15 * n)
    +        check("<![CDATA[" * 9 * n)
    +        check("<!doctype" * 35 * n)
    +
     
     class AttributesTestCase(TestCaseBase):
     
    
  • Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst · +4 -0 added
    @@ -0,0 +1,4 @@
    +Fix quadratic complexity in processing specially crafted input in
    +:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
    +to the HTML5 specs -- comments and declarations are automatically closed,
    +tags are ignored.
    
d851f8e258c7

[3.14] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135481)

https://github.com/python/cpython · Miss Islington (bot) · Jun 13, 2025 · via osv
3 files changed · +111 -31
  • Lib/html/parser.py · +30 -11 modified
    @@ -27,6 +27,7 @@
     attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
     
     starttagopen = re.compile('<[a-zA-Z]')
    +endtagopen = re.compile('</[a-zA-Z]')
     piclose = re.compile('>')
     commentclose = re.compile(r'--\s*>')
     # Note:
    @@ -195,25 +196,43 @@ def goahead(self, end):
                         k = self.parse_pi(i)
                     elif startswith("<!", i):
                         k = self.parse_html_declaration(i)
    -                elif (i + 1) < n:
    +                elif (i + 1) < n or end:
                         self.handle_data("<")
                         k = i + 1
                     else:
                         break
                     if k < 0:
                         if not end:
                             break
    -                    k = rawdata.find('>', i + 1)
    -                    if k < 0:
    -                        k = rawdata.find('<', i + 1)
    -                        if k < 0:
    -                            k = i + 1
    -                    else:
    -                        k += 1
    -                    if self.convert_charrefs and not self.cdata_elem:
    -                        self.handle_data(unescape(rawdata[i:k]))
    +                    if starttagopen.match(rawdata, i):  # < + letter
    +                        pass
    +                    elif startswith("</", i):
    +                        if i + 2 == n:
    +                            self.handle_data("</")
    +                        elif endtagopen.match(rawdata, i):  # </ + letter
    +                            pass
    +                        else:
    +                            # bogus comment
    +                            self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<!--", i):
    +                        j = n
    +                        for suffix in ("--!", "--", "-"):
    +                            if rawdata.endswith(suffix, i+4):
    +                                j -= len(suffix)
    +                                break
    +                        self.handle_comment(rawdata[i+4:j])
    +                    elif startswith("<![CDATA[", i):
    +                        self.unknown_decl(rawdata[i+3:])
    +                    elif rawdata[i:i+9].lower() == '<!doctype':
    +                        self.handle_decl(rawdata[i+2:])
    +                    elif startswith("<!", i):
    +                        # bogus comment
    +                        self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<?", i):
    +                        self.handle_pi(rawdata[i+2:])
                         else:
    -                        self.handle_data(rawdata[i:k])
    +                        raise AssertionError("we should not get here!")
    +                    k = n
                     i = self.updatepos(i, k)
                 elif startswith("&#", i):
                     match = charref.match(rawdata, i)
    
  • Lib/test/test_htmlparser.py · +77 -20 modified
    @@ -5,6 +5,7 @@
     import unittest
     
     from unittest.mock import patch
    +from test import support
     
     
     class EventCollector(html.parser.HTMLParser):
    @@ -430,28 +431,34 @@ def test_tolerant_parsing(self):
                                 ('data', '<'),
                                 ('starttag', 'bc<', [('a', None)]),
                                 ('endtag', 'html'),
    -                            ('data', '\n<img src="URL>'),
    -                            ('comment', '/img'),
    -                            ('endtag', 'html<')])
    +                            ('data', '\n')])
     
         def test_starttag_junk_chars(self):
    +        self._run_check("<", [('data', '<')])
    +        self._run_check("<>", [('data', '<>')])
    +        self._run_check("< >", [('data', '< >')])
    +        self._run_check("< ", [('data', '< ')])
             self._run_check("</>", [])
    +        self._run_check("<$>", [('data', '<$>')])
             self._run_check("</$>", [('comment', '$')])
             self._run_check("</", [('data', '</')])
    -        self._run_check("</a", [('data', '</a')])
    +        self._run_check("</a", [])
    +        self._run_check("</ a>", [('endtag', 'a')])
    +        self._run_check("</ a", [('comment', ' a')])
             self._run_check("<a<a>", [('starttag', 'a<a', [])])
             self._run_check("</a<a>", [('endtag', 'a<a')])
    -        self._run_check("<!", [('data', '<!')])
    -        self._run_check("<a", [('data', '<a')])
    -        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
    -        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
    -        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
    -        self._run_check("<a foo='>", [('data', "<a foo='>")])
    +        self._run_check("<!", [('comment', '')])
    +        self._run_check("<a", [])
    +        self._run_check("<a foo='bar'", [])
    +        self._run_check("<a foo='bar", [])
    +        self._run_check("<a foo='>'", [])
    +        self._run_check("<a foo='>", [])
             self._run_check("<a$>", [('starttag', 'a$', [])])
             self._run_check("<a$b>", [('starttag', 'a$b', [])])
             self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
             self._run_check("<a$b  >", [('starttag', 'a$b', [])])
             self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
    +        self._run_check("</a$b>", [('endtag', 'a$b')])
     
         def test_slashes_in_starttag(self):
             self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
    @@ -576,21 +583,50 @@ def test_EOF_in_charref(self):
             for html, expected in data:
                 self._run_check(html, expected)
     
    -    def test_EOF_in_comments_or_decls(self):
    +    def test_eof_in_comments(self):
             data = [
    -            ('<!', [('data', '<!')]),
    -            ('<!-', [('data', '<!-')]),
    -            ('<!--', [('data', '<!--')]),
    -            ('<![', [('data', '<![')]),
    -            ('<![CDATA[', [('data', '<![CDATA[')]),
    -            ('<![CDATA[x', [('data', '<![CDATA[x')]),
    -            ('<!DOCTYPE', [('data', '<!DOCTYPE')]),
    -            ('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
    +            ('<!--', [('comment', '')]),
    +            ('<!---', [('comment', '')]),
    +            ('<!----', [('comment', '')]),
    +            ('<!-----', [('comment', '-')]),
    +            ('<!------', [('comment', '--')]),
    +            ('<!----!', [('comment', '')]),
    +            ('<!---!', [('comment', '-!')]),
    +            ('<!---!>', [('comment', '-!>')]),
    +            ('<!--foo', [('comment', 'foo')]),
    +            ('<!--foo-', [('comment', 'foo')]),
    +            ('<!--foo--', [('comment', 'foo')]),
    +            ('<!--foo--!', [('comment', 'foo')]),
    +            ('<!--<!--', [('comment', '<!')]),
    +            ('<!--<!--!', [('comment', '<!')]),
             ]
             for html, expected in data:
                 self._run_check(html, expected)
    +
    +    def test_eof_in_declarations(self):
    +        data = [
    +            ('<!', [('comment', '')]),
    +            ('<!-', [('comment', '-')]),
    +            ('<![', [('comment', '[')]),
    +            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
    +            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
    +            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
    +            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
    +            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
    +            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
    +            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
    +            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
    +            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
    +            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
    +            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
    +             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
         def test_bogus_comments(self):
    -        html = ('<! not really a comment >'
    +        html = ('<!ELEMENT br EMPTY>'
    +                '<! not really a comment >'
                     '<! not a comment either -->'
                     '<! -- close enough -->'
                     '<!><!<-- this was an empty comment>'
    @@ -604,6 +640,7 @@ def test_bogus_comments(self):
                     '<![CDATA]]>'  # required '[' after CDATA
             )
             expected = [
    +            ('comment', 'ELEMENT br EMPTY'),
                 ('comment', ' not really a comment '),
                 ('comment', ' not a comment either --'),
                 ('comment', ' -- close enough --'),
    @@ -684,6 +721,26 @@ def test_convert_charrefs_dropped_text(self):
                  ('endtag', 'a'), ('data', ' bar & baz')]
             )
     
    +    @support.requires_resource('cpu')
    +    def test_eof_no_quadratic_complexity(self):
    +        # Each of these examples used to take about an hour.
    +        # Now they take a fraction of a second.
    +        def check(source):
    +            parser = html.parser.HTMLParser()
    +            parser.feed(source)
    +            parser.close()
    +        n = 120_000
    +        check("<a " * n)
    +        check("<a a=" * n)
    +        check("</a " * 14 * n)
    +        check("</a a=" * 11 * n)
    +        check("<!--" * 4 * n)
    +        check("<!" * 60 * n)
    +        check("<?" * 19 * n)
    +        check("</$" * 15 * n)
    +        check("<![CDATA[" * 9 * n)
    +        check("<!doctype" * 35 * n)
    +
     
     class AttributesTestCase(TestCaseBase):
     
    
  • Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst · +4 -0 added
    @@ -0,0 +1,4 @@
    +Fix quadratic complexity in processing specially crafted input in
    +:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
    +to the HTML5 specs -- comments and declarations are automatically closed,
    +tags are ignored.
    
4455cbabf991

[3.13] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135482)

https://github.com/python/cpython · Miss Islington (bot) · Jun 13, 2025 · via osv
3 files changed · +111 -31
  • Lib/html/parser.py · +30 -11 modified
    @@ -27,6 +27,7 @@
     attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
     
     starttagopen = re.compile('<[a-zA-Z]')
    +endtagopen = re.compile('</[a-zA-Z]')
     piclose = re.compile('>')
     commentclose = re.compile(r'--\s*>')
     # Note:
    @@ -195,25 +196,43 @@ def goahead(self, end):
                         k = self.parse_pi(i)
                     elif startswith("<!", i):
                         k = self.parse_html_declaration(i)
    -                elif (i + 1) < n:
    +                elif (i + 1) < n or end:
                         self.handle_data("<")
                         k = i + 1
                     else:
                         break
                     if k < 0:
                         if not end:
                             break
    -                    k = rawdata.find('>', i + 1)
    -                    if k < 0:
    -                        k = rawdata.find('<', i + 1)
    -                        if k < 0:
    -                            k = i + 1
    -                    else:
    -                        k += 1
    -                    if self.convert_charrefs and not self.cdata_elem:
    -                        self.handle_data(unescape(rawdata[i:k]))
    +                    if starttagopen.match(rawdata, i):  # < + letter
    +                        pass
    +                    elif startswith("</", i):
    +                        if i + 2 == n:
    +                            self.handle_data("</")
    +                        elif endtagopen.match(rawdata, i):  # </ + letter
    +                            pass
    +                        else:
    +                            # bogus comment
    +                            self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<!--", i):
    +                        j = n
    +                        for suffix in ("--!", "--", "-"):
    +                            if rawdata.endswith(suffix, i+4):
    +                                j -= len(suffix)
    +                                break
    +                        self.handle_comment(rawdata[i+4:j])
    +                    elif startswith("<![CDATA[", i):
    +                        self.unknown_decl(rawdata[i+3:])
    +                    elif rawdata[i:i+9].lower() == '<!doctype':
    +                        self.handle_decl(rawdata[i+2:])
    +                    elif startswith("<!", i):
    +                        # bogus comment
    +                        self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<?", i):
    +                        self.handle_pi(rawdata[i+2:])
                         else:
    -                        self.handle_data(rawdata[i:k])
    +                        raise AssertionError("we should not get here!")
    +                    k = n
                     i = self.updatepos(i, k)
                 elif startswith("&#", i):
                     match = charref.match(rawdata, i)
    
  • Lib/test/test_htmlparser.py · +77 -20 modified
    @@ -5,6 +5,7 @@
     import unittest
     
     from unittest.mock import patch
    +from test import support
     
     
     class EventCollector(html.parser.HTMLParser):
    @@ -430,28 +431,34 @@ def test_tolerant_parsing(self):
                                 ('data', '<'),
                                 ('starttag', 'bc<', [('a', None)]),
                                 ('endtag', 'html'),
    -                            ('data', '\n<img src="URL>'),
    -                            ('comment', '/img'),
    -                            ('endtag', 'html<')])
    +                            ('data', '\n')])
     
         def test_starttag_junk_chars(self):
    +        self._run_check("<", [('data', '<')])
    +        self._run_check("<>", [('data', '<>')])
    +        self._run_check("< >", [('data', '< >')])
    +        self._run_check("< ", [('data', '< ')])
             self._run_check("</>", [])
    +        self._run_check("<$>", [('data', '<$>')])
             self._run_check("</$>", [('comment', '$')])
             self._run_check("</", [('data', '</')])
    -        self._run_check("</a", [('data', '</a')])
    +        self._run_check("</a", [])
    +        self._run_check("</ a>", [('endtag', 'a')])
    +        self._run_check("</ a", [('comment', ' a')])
             self._run_check("<a<a>", [('starttag', 'a<a', [])])
             self._run_check("</a<a>", [('endtag', 'a<a')])
    -        self._run_check("<!", [('data', '<!')])
    -        self._run_check("<a", [('data', '<a')])
    -        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
    -        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
    -        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
    -        self._run_check("<a foo='>", [('data', "<a foo='>")])
    +        self._run_check("<!", [('comment', '')])
    +        self._run_check("<a", [])
    +        self._run_check("<a foo='bar'", [])
    +        self._run_check("<a foo='bar", [])
    +        self._run_check("<a foo='>'", [])
    +        self._run_check("<a foo='>", [])
             self._run_check("<a$>", [('starttag', 'a$', [])])
             self._run_check("<a$b>", [('starttag', 'a$b', [])])
             self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
             self._run_check("<a$b  >", [('starttag', 'a$b', [])])
             self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
    +        self._run_check("</a$b>", [('endtag', 'a$b')])
     
         def test_slashes_in_starttag(self):
             self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
    @@ -576,21 +583,50 @@ def test_EOF_in_charref(self):
             for html, expected in data:
                 self._run_check(html, expected)
     
    -    def test_EOF_in_comments_or_decls(self):
    +    def test_eof_in_comments(self):
             data = [
    -            ('<!', [('data', '<!')]),
    -            ('<!-', [('data', '<!-')]),
    -            ('<!--', [('data', '<!--')]),
    -            ('<![', [('data', '<![')]),
    -            ('<![CDATA[', [('data', '<![CDATA[')]),
    -            ('<![CDATA[x', [('data', '<![CDATA[x')]),
    -            ('<!DOCTYPE', [('data', '<!DOCTYPE')]),
    -            ('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
    +            ('<!--', [('comment', '')]),
    +            ('<!---', [('comment', '')]),
    +            ('<!----', [('comment', '')]),
    +            ('<!-----', [('comment', '-')]),
    +            ('<!------', [('comment', '--')]),
    +            ('<!----!', [('comment', '')]),
    +            ('<!---!', [('comment', '-!')]),
    +            ('<!---!>', [('comment', '-!>')]),
    +            ('<!--foo', [('comment', 'foo')]),
    +            ('<!--foo-', [('comment', 'foo')]),
    +            ('<!--foo--', [('comment', 'foo')]),
    +            ('<!--foo--!', [('comment', 'foo')]),
    +            ('<!--<!--', [('comment', '<!')]),
    +            ('<!--<!--!', [('comment', '<!')]),
             ]
             for html, expected in data:
                 self._run_check(html, expected)
    +
    +    def test_eof_in_declarations(self):
    +        data = [
    +            ('<!', [('comment', '')]),
    +            ('<!-', [('comment', '-')]),
    +            ('<![', [('comment', '[')]),
    +            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
    +            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
    +            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
    +            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
    +            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
    +            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
    +            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
    +            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
    +            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
    +            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
    +            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
    +             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
         def test_bogus_comments(self):
    -        html = ('<! not really a comment >'
    +        html = ('<!ELEMENT br EMPTY>'
    +                '<! not really a comment >'
                     '<! not a comment either -->'
                     '<! -- close enough -->'
                     '<!><!<-- this was an empty comment>'
    @@ -604,6 +640,7 @@ def test_bogus_comments(self):
                     '<![CDATA]]>'  # required '[' after CDATA
             )
             expected = [
    +            ('comment', 'ELEMENT br EMPTY'),
                 ('comment', ' not really a comment '),
                 ('comment', ' not a comment either --'),
                 ('comment', ' -- close enough --'),
    @@ -684,6 +721,26 @@ def test_convert_charrefs_dropped_text(self):
                  ('endtag', 'a'), ('data', ' bar & baz')]
             )
     
    +    @support.requires_resource('cpu')
    +    def test_eof_no_quadratic_complexity(self):
    +        # Each of these examples used to take about an hour.
    +        # Now they take a fraction of a second.
    +        def check(source):
    +            parser = html.parser.HTMLParser()
    +            parser.feed(source)
    +            parser.close()
    +        n = 120_000
    +        check("<a " * n)
    +        check("<a a=" * n)
    +        check("</a " * 14 * n)
    +        check("</a a=" * 11 * n)
    +        check("<!--" * 4 * n)
    +        check("<!" * 60 * n)
    +        check("<?" * 19 * n)
    +        check("</$" * 15 * n)
    +        check("<![CDATA[" * 9 * n)
    +        check("<!doctype" * 35 * n)
    +
     
     class AttributesTestCase(TestCaseBase):
     
    
  • Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst · +4 -0 added
    @@ -0,0 +1,4 @@
    +Fix quadratic complexity in processing specially crafted input in
    +:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
    +to the HTML5 specs -- comments and declarations are automatically closed,
    +tags are ignored.
    
6eb6c5dbfb52

gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464)

https://github.com/python/cpython · Serhiy Storchaka · Jun 13, 2025 · via osv
3 files changed · +111 -31
  • Lib/html/parser.py · +30 -11 modified
    @@ -27,6 +27,7 @@
     attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
     
     starttagopen = re.compile('<[a-zA-Z]')
    +endtagopen = re.compile('</[a-zA-Z]')
     piclose = re.compile('>')
     commentclose = re.compile(r'--\s*>')
     # Note:
    @@ -195,25 +196,43 @@ def goahead(self, end):
                         k = self.parse_pi(i)
                     elif startswith("<!", i):
                         k = self.parse_html_declaration(i)
    -                elif (i + 1) < n:
    +                elif (i + 1) < n or end:
                         self.handle_data("<")
                         k = i + 1
                     else:
                         break
                     if k < 0:
                         if not end:
                             break
    -                    k = rawdata.find('>', i + 1)
    -                    if k < 0:
    -                        k = rawdata.find('<', i + 1)
    -                        if k < 0:
    -                            k = i + 1
    -                    else:
    -                        k += 1
    -                    if self.convert_charrefs and not self.cdata_elem:
    -                        self.handle_data(unescape(rawdata[i:k]))
    +                    if starttagopen.match(rawdata, i):  # < + letter
    +                        pass
    +                    elif startswith("</", i):
    +                        if i + 2 == n:
    +                            self.handle_data("</")
    +                        elif endtagopen.match(rawdata, i):  # </ + letter
    +                            pass
    +                        else:
    +                            # bogus comment
    +                            self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<!--", i):
    +                        j = n
    +                        for suffix in ("--!", "--", "-"):
    +                            if rawdata.endswith(suffix, i+4):
    +                                j -= len(suffix)
    +                                break
    +                        self.handle_comment(rawdata[i+4:j])
    +                    elif startswith("<![CDATA[", i):
    +                        self.unknown_decl(rawdata[i+3:])
    +                    elif rawdata[i:i+9].lower() == '<!doctype':
    +                        self.handle_decl(rawdata[i+2:])
    +                    elif startswith("<!", i):
    +                        # bogus comment
    +                        self.handle_comment(rawdata[i+2:])
    +                    elif startswith("<?", i):
    +                        self.handle_pi(rawdata[i+2:])
                         else:
    -                        self.handle_data(rawdata[i:k])
    +                        raise AssertionError("we should not get here!")
    +                    k = n
                     i = self.updatepos(i, k)
                 elif startswith("&#", i):
                     match = charref.match(rawdata, i)
    
  • Lib/test/test_htmlparser.py · +77 -20 · modified
    @@ -5,6 +5,7 @@
     import unittest
     
     from unittest.mock import patch
    +from test import support
     
     
     class EventCollector(html.parser.HTMLParser):
    @@ -430,28 +431,34 @@ def test_tolerant_parsing(self):
                                 ('data', '<'),
                                 ('starttag', 'bc<', [('a', None)]),
                                 ('endtag', 'html'),
    -                            ('data', '\n<img src="URL>'),
    -                            ('comment', '/img'),
    -                            ('endtag', 'html<')])
    +                            ('data', '\n')])
     
         def test_starttag_junk_chars(self):
    +        self._run_check("<", [('data', '<')])
    +        self._run_check("<>", [('data', '<>')])
    +        self._run_check("< >", [('data', '< >')])
    +        self._run_check("< ", [('data', '< ')])
             self._run_check("</>", [])
    +        self._run_check("<$>", [('data', '<$>')])
             self._run_check("</$>", [('comment', '$')])
             self._run_check("</", [('data', '</')])
    -        self._run_check("</a", [('data', '</a')])
    +        self._run_check("</a", [])
    +        self._run_check("</ a>", [('endtag', 'a')])
    +        self._run_check("</ a", [('comment', ' a')])
             self._run_check("<a<a>", [('starttag', 'a<a', [])])
             self._run_check("</a<a>", [('endtag', 'a<a')])
    -        self._run_check("<!", [('data', '<!')])
    -        self._run_check("<a", [('data', '<a')])
    -        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
    -        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
    -        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
    -        self._run_check("<a foo='>", [('data', "<a foo='>")])
    +        self._run_check("<!", [('comment', '')])
    +        self._run_check("<a", [])
    +        self._run_check("<a foo='bar'", [])
    +        self._run_check("<a foo='bar", [])
    +        self._run_check("<a foo='>'", [])
    +        self._run_check("<a foo='>", [])
             self._run_check("<a$>", [('starttag', 'a$', [])])
             self._run_check("<a$b>", [('starttag', 'a$b', [])])
             self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
             self._run_check("<a$b  >", [('starttag', 'a$b', [])])
             self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
    +        self._run_check("</a$b>", [('endtag', 'a$b')])
     
         def test_slashes_in_starttag(self):
             self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
    @@ -576,21 +583,50 @@ def test_EOF_in_charref(self):
             for html, expected in data:
                 self._run_check(html, expected)
     
    -    def test_EOF_in_comments_or_decls(self):
    +    def test_eof_in_comments(self):
             data = [
    -            ('<!', [('data', '<!')]),
    -            ('<!-', [('data', '<!-')]),
    -            ('<!--', [('data', '<!--')]),
    -            ('<![', [('data', '<![')]),
    -            ('<![CDATA[', [('data', '<![CDATA[')]),
    -            ('<![CDATA[x', [('data', '<![CDATA[x')]),
    -            ('<!DOCTYPE', [('data', '<!DOCTYPE')]),
    -            ('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
    +            ('<!--', [('comment', '')]),
    +            ('<!---', [('comment', '')]),
    +            ('<!----', [('comment', '')]),
    +            ('<!-----', [('comment', '-')]),
    +            ('<!------', [('comment', '--')]),
    +            ('<!----!', [('comment', '')]),
    +            ('<!---!', [('comment', '-!')]),
    +            ('<!---!>', [('comment', '-!>')]),
    +            ('<!--foo', [('comment', 'foo')]),
    +            ('<!--foo-', [('comment', 'foo')]),
    +            ('<!--foo--', [('comment', 'foo')]),
    +            ('<!--foo--!', [('comment', 'foo')]),
    +            ('<!--<!--', [('comment', '<!')]),
    +            ('<!--<!--!', [('comment', '<!')]),
             ]
             for html, expected in data:
                 self._run_check(html, expected)
    +
    +    def test_eof_in_declarations(self):
    +        data = [
    +            ('<!', [('comment', '')]),
    +            ('<!-', [('comment', '-')]),
    +            ('<![', [('comment', '[')]),
    +            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
    +            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
    +            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
    +            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
    +            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
    +            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
    +            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
    +            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
    +            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
    +            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
    +            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
    +             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
    +        ]
    +        for html, expected in data:
    +            self._run_check(html, expected)
    +
         def test_bogus_comments(self):
    -        html = ('<! not really a comment >'
    +        html = ('<!ELEMENT br EMPTY>'
    +                '<! not really a comment >'
                     '<! not a comment either -->'
                     '<! -- close enough -->'
                     '<!><!<-- this was an empty comment>'
    @@ -604,6 +640,7 @@ def test_bogus_comments(self):
                     '<![CDATA]]>'  # required '[' after CDATA
             )
             expected = [
    +            ('comment', 'ELEMENT br EMPTY'),
                 ('comment', ' not really a comment '),
                 ('comment', ' not a comment either --'),
                 ('comment', ' -- close enough --'),
    @@ -684,6 +721,26 @@ def test_convert_charrefs_dropped_text(self):
                  ('endtag', 'a'), ('data', ' bar & baz')]
             )
     
    +    @support.requires_resource('cpu')
    +    def test_eof_no_quadratic_complexity(self):
    +        # Each of these examples used to take about an hour.
    +        # Now they take a fraction of a second.
    +        def check(source):
    +            parser = html.parser.HTMLParser()
    +            parser.feed(source)
    +            parser.close()
    +        n = 120_000
    +        check("<a " * n)
    +        check("<a a=" * n)
    +        check("</a " * 14 * n)
    +        check("</a a=" * 11 * n)
    +        check("<!--" * 4 * n)
    +        check("<!" * 60 * n)
    +        check("<?" * 19 * n)
    +        check("</$" * 15 * n)
    +        check("<![CDATA[" * 9 * n)
    +        check("<!doctype" * 35 * n)
    +
     
     class AttributesTestCase(TestCaseBase):
     
    
  • Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst · +4 -0 · added
    @@ -0,0 +1,4 @@
    +Fix quadratic complexity in processing specially crafted input in
    +:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
    +to the HTML5 specs -- comments and declarations are automatically closed,
    +tags are ignored.
    

Vulnerability mechanics

Generated by null/stub on May 9, 2026. Inputs: CWE entries + fix-commit diffs from this CVE's patches. Citations validated against bundle.

References

10

News mentions

0

No linked articles in our index yet.