From e41d61cfaf4b11324df05c19bd475a52cdacabc7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Thu, 7 May 2026 14:18:52 +0300 Subject: [PATCH 1/4] gh-149489: Fix ElementTree serialization to HTML * The content of comments, processing instructions and elements "xmp", "iframe", "noembed", "noframes", and "plaintext" is no longer escaped. * The "plaintext" element no longer has a closing tag. * Add support for empty attributes (with value None). --- Lib/test/test_xml_etree.py | 29 ++++++++++++++++++- Lib/xml/etree/ElementTree.py | 24 +++++++++------ ...-05-07-14-18-47.gh-issue-149489.bX9iHe.rst | 5 ++++ 3 files changed, 48 insertions(+), 10 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 8f3efe9fc90794..b820845f3b63e2 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1278,7 +1278,13 @@ def check(p, expected, namespaces=None): {'': 'http://www.w3.org/2001/XMLSchema', 'ns': 'http://www.w3.org/2001/XMLSchema'}) - def test_processinginstruction(self): + def test_comment_serialization(self): + comm = ET.Comment('<spam> & ham') + self.assertEqual(ET.tostring(comm), b'<!--<spam> & ham-->') + self.assertEqual(ET.tostring(comm, method='html'), b'<!--<spam> & ham-->') + self.assertEqual(ET.tostring(comm, method='text'), b'<spam> & ham') + + def test_processinginstruction_serialization(self): # Test ProcessingInstruction directly self.assertEqual(ET.tostring(ET.ProcessingInstruction('test', 'instruction')), @@ -1293,6 +1299,21 @@ def test_processinginstruction(self): self.assertEqual(ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin-1'), b"<?xml version='1.0' encoding='latin-1'?>\n" b"<?test <testing&>\xe3?>") + self.assertEqual(ET.tostring(ET.PI('test', 'ham & eggs < spam'), method='html'), + b'<?test ham & eggs < spam?>') + + def test_empty_attribute_serialization(self): + elem = ET.Element('tag', attrib={'attr': None}) + self.assertRaises(TypeError, ET.tostring, elem) + self.assertEqual(ET.tostring(elem, method='html'), b'<tag attr></tag>') + + 
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) + def test_html_cdata_elems_serialization(self, tag): + tag = tag.title() + elem = ET.Element(tag) + elem.text = '<spam>&ham' + self.assertEqual(ET.tostring(elem, method='html'), + ('<%s><spam>&ham</%s>' % (tag, tag)).encode()) def test_html_empty_elems_serialization(self): # issue 15970 @@ -1308,6 +1329,12 @@ def test_html_empty_elems_serialization(self): method='html') self.assertEqual(serialized, expected) + def test_html_plaintext_serialization(self): + elem = ET.Element('PlainText') + elem.text = '<spam>&ham' + self.assertEqual(ET.tostring(elem, method='html'), + b'<PlainText><spam>&ham') + def test_dump_attribute_order(self): # See BPO 34160 e = ET.Element('cirriculum', status='public', company='example') diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 85766e02b531ce..7b14ec360d7cf7 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -907,17 +907,20 @@ def _serialize_xml(write, elem, qnames, namespaces, if elem.tail: write(_escape_cdata(elem.tail)) +_CDATA_CONTENT_ELEMENTS = {"script", "style", "xmp", "iframe", "noembed", + "noframes", "plaintext"} + HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr", "img", "input", "isindex", "link", "meta", "param", "source", - "track", "wbr"} + "track", "wbr", "plaintext"} def _serialize_html(write, elem, qnames, namespaces, **kwargs): tag = elem.tag text = elem.text if tag is Comment: - write("<!--%s-->" % _escape_cdata(text)) + write("<!--%s-->" % text) elif tag is ProcessingInstruction: - write("<?%s?>" % _escape_cdata(text)) + write("<?%s?>" % text) else: tag = qnames[tag] if tag is None: @@ -941,16 +944,19 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs): for k, v in items: if isinstance(k, QName): k = k.text - if isinstance(v, QName): - v = qnames[v.text] + k = qnames[k] + if v is None: + write(" %s" % k) else: - v = _escape_attrib_html(v) - # FIXME: handle 
boolean attributes - write(" %s=\"%s\"" % (qnames[k], v)) + if isinstance(v, QName): + v = qnames[v.text] + else: + v = _escape_attrib_html(v) + write(" %s=\"%s\"" % (k, v)) write(">") ltag = tag.lower() if text: - if ltag == "script" or ltag == "style": + if ltag in _CDATA_CONTENT_ELEMENTS: write(text) else: write(_escape_cdata(text)) diff --git a/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst b/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst new file mode 100644 index 00000000000000..1550c893fd7c45 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst @@ -0,0 +1,5 @@ +Fix :mod:`~xml.etree.ElementTree` serialization to HTML. The content of +comments, processing instructions and elements "xmp", "iframe", "noembed", +"noframes", and "plaintext" is no longer escaped. The "plaintext" element no +longer has a closing tag. Add support for empty attributes (with value +``None``). From e111ef9f3d02afba3cc2e28b27ee16983fda1f40 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Thu, 28 May 2026 18:16:54 +0300 Subject: [PATCH 2/4] Update tests after fixing itertext(). 
--- Lib/test/test_xml_etree.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index b2c60178e6ef26..a0b2f682977711 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1291,7 +1291,7 @@ def test_comment_serialization(self): comm = ET.Comment('<spam> & ham') self.assertEqual(ET.tostring(comm), b'<!--<spam> & ham-->') self.assertEqual(ET.tostring(comm, method='html'), b'<!--<spam> & ham-->') - self.assertEqual(ET.tostring(comm, method='text'), b'<spam> & ham') + self.assertEqual(ET.tostring(comm, method='text'), b'') def test_processinginstruction_serialization(self): # Test ProcessingInstruction directly @@ -1308,8 +1308,10 @@ def test_processinginstruction_serialization(self): self.assertEqual(ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin-1'), b"<?xml version='1.0' encoding='latin-1'?>\n" b"<?test <testing&>\xe3?>") - self.assertEqual(ET.tostring(ET.PI('test', 'ham & eggs < spam'), method='html'), - b'<?test ham & eggs < spam?>') + pi = ET.PI('test', 'ham & eggs < spam') + self.assertEqual(ET.tostring(pi), b'<?test ham & eggs < spam?>') + self.assertEqual(ET.tostring(pi, method='html'), b'<?test ham & eggs < spam?>') + self.assertEqual(ET.tostring(pi, method='text'), b'') def test_empty_attribute_serialization(self): elem = ET.Element('tag', attrib={'attr': None}) From 752f4e7cbc1deb516cab2c6ecf32b0107eee57f8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Fri, 29 May 2026 09:14:04 +0300 Subject: [PATCH 3/4] Apply suggestion from @ezio-melotti Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com> --- Lib/xml/etree/ElementTree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 025cda4bdecf6b..53727d7940b3f2 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -956,7 +956,7 @@ def _serialize_html(write, elem, qnames, 
namespaces, **kwargs): k = k.text k = qnames[k] if v is None: - write(" %s" % k) + write(" %s" % k) # empty attr else: if isinstance(v, QName): v = qnames[v.text] From 57b573937f94ecc968f2d9a3d4a3e2dfcc4f31fd Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Fri, 29 May 2026 09:54:18 +0300 Subject: [PATCH 4/4] Add comments to tests. --- Lib/test/test_xml_etree.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index a0b2f682977711..89aff568a1b4ef 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1289,8 +1289,10 @@ def check(p, expected, namespaces=None): def test_comment_serialization(self): comm = ET.Comment('<spam> & ham') + # comments are not escaped self.assertEqual(ET.tostring(comm), b'<!--<spam> & ham-->') self.assertEqual(ET.tostring(comm, method='html'), b'<!--<spam> & ham-->') + # no comments in text serialization self.assertEqual(ET.tostring(comm, method='text'), b'') def test_processinginstruction_serialization(self): @@ -1302,7 +1304,7 @@ def test_processinginstruction_serialization(self): b'<?test instruction?>') # Issue #2746 - + # processing instructions are not escaped self.assertEqual(ET.tostring(ET.PI('test', '<testing&>')), b'<?test <testing&>?>') self.assertEqual(ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin-1'), @@ -1311,15 +1313,18 @@ def test_processinginstruction_serialization(self): pi = ET.PI('test', 'ham & eggs < spam') self.assertEqual(ET.tostring(pi), b'<?test ham & eggs < spam?>') self.assertEqual(ET.tostring(pi, method='html'), b'<?test ham & eggs < spam?>') + # no processing instructions in text serialization self.assertEqual(ET.tostring(pi, method='text'), b'') def test_empty_attribute_serialization(self): + # empty attrs only work in html elem = ET.Element('tag', attrib={'attr': None}) self.assertRaises(TypeError, ET.tostring, elem) self.assertEqual(ET.tostring(elem, method='html'), b'<tag 
attr></tag>') @support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) def test_html_cdata_elems_serialization(self, tag): + # content of raw text elements is not escaped in html tag = tag.title() elem = ET.Element(tag) elem.text = '<spam>&ham' @@ -1341,6 +1346,8 @@ def test_html_empty_elems_serialization(self): self.assertEqual(serialized, expected) def test_html_plaintext_serialization(self): + # content of plaintext is not escaped in html + # no end tag for plaintext elem = ET.Element('PlainText') elem.text = '<spam>&ham' self.assertEqual(ET.tostring(elem, method='html'),