Skip to content

Commit b46df7f

Browse files
[3.13] gh-149489: Fix ElementTree serialization to HTML (GH-149490) (GH-150596) (GH-150609)
* The content of elements "xmp", "iframe", "noembed", "noframes", and "plaintext" is no longer escaped. * The "plaintext" element no longer has a closing tag. (cherry picked from commit c42e6d3) (cherry picked from commit bcd29e4) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 3a15d16 commit b46df7f

3 files changed

Lines changed: 32 additions & 4 deletions

File tree

Lib/test/test_xml_etree.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,7 +1248,12 @@ def check(p, expected, namespaces=None):
12481248
{'': 'http://www.w3.org/2001/XMLSchema',
12491249
'ns': 'http://www.w3.org/2001/XMLSchema'})
12501250

1251-
def test_processinginstruction(self):
1251+
def test_comment_serialization(self):
1252+
comm = ET.Comment('<spam> & ham')
1253+
# comments are not escaped
1254+
self.assertEqual(ET.tostring(comm), b'<!--<spam> & ham-->')
1255+
1256+
def test_processinginstruction_serialization(self):
12521257
# Test ProcessingInstruction directly
12531258

12541259
self.assertEqual(ET.tostring(ET.ProcessingInstruction('test', 'instruction')),
@@ -1257,13 +1262,22 @@ def test_processinginstruction(self):
12571262
b'<?test instruction?>')
12581263

12591264
# Issue #2746
1260-
1265+
# processing instructions are not escaped
12611266
self.assertEqual(ET.tostring(ET.PI('test', '<testing&>')),
12621267
b'<?test <testing&>?>')
12631268
self.assertEqual(ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin-1'),
12641269
b"<?xml version='1.0' encoding='latin-1'?>\n"
12651270
b"<?test <testing&>\xe3?>")
12661271

1272+
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes"))
1273+
def test_html_cdata_elems_serialization(self, tag):
1274+
# content of raw text elements is not escaped in html
1275+
tag = tag.title()
1276+
elem = ET.Element(tag)
1277+
elem.text = '<spam>&ham'
1278+
self.assertEqual(ET.tostring(elem, method='html'),
1279+
('<%s><spam>&ham</%s>' % (tag, tag)).encode())
1280+
12671281
def test_html_empty_elems_serialization(self):
12681282
# issue 15970
12691283
# from http://www.w3.org/TR/html401/index/elements.html
@@ -1278,6 +1292,14 @@ def test_html_empty_elems_serialization(self):
12781292
method='html')
12791293
self.assertEqual(serialized, expected)
12801294

1295+
def test_html_plaintext_serialization(self):
1296+
# content of plaintext is not escaped in html
1297+
# no end tag for plaintext
1298+
elem = ET.Element('PlainText')
1299+
elem.text = '<spam>&ham'
1300+
self.assertEqual(ET.tostring(elem, method='html'),
1301+
b'<PlainText><spam>&ham')
1302+
12811303
def test_dump_attribute_order(self):
12821304
# See BPO 34160
12831305
e = ET.Element('cirriculum', status='public', company='example')

Lib/xml/etree/ElementTree.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -913,9 +913,12 @@ def _serialize_xml(write, elem, qnames, namespaces,
913913
if elem.tail:
914914
write(_escape_cdata(elem.tail))
915915

916+
_CDATA_CONTENT_ELEMENTS = {"script", "style", "xmp", "iframe", "noembed",
917+
"noframes", "plaintext"}
918+
916919
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr",
917920
"img", "input", "isindex", "link", "meta", "param", "source",
918-
"track", "wbr"}
921+
"track", "wbr", "plaintext"}
919922

920923
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
921924
tag = elem.tag
@@ -956,7 +959,7 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs):
956959
write(">")
957960
ltag = tag.lower()
958961
if text:
959-
if ltag == "script" or ltag == "style":
962+
if ltag in _CDATA_CONTENT_ELEMENTS:
960963
write(text)
961964
else:
962965
write(_escape_cdata(text))
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix :mod:`~xml.etree.ElementTree` serialization to HTML. The content of
2+
elements "xmp", "iframe", "noembed", "noframes", and "plaintext" is no longer
3+
escaped. The "plaintext" element no longer has a closing tag.

0 commit comments

Comments
 (0)