From e41d61cfaf4b11324df05c19bd475a52cdacabc7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 7 May 2026 14:18:52 +0300 Subject: [PATCH 1/8] gh-149489: Fix ElementTree serialization to HTML * The content of comments, processing instructions and elements "xmp", "iframe", "noembed", "noframes", and "plaintext" is no longer escaped. * The "plaintext" element no longer have the closing tag. * Add support of empty attributes (with value None). --- Lib/test/test_xml_etree.py | 29 ++++++++++++++++++- Lib/xml/etree/ElementTree.py | 24 +++++++++------ ...-05-07-14-18-47.gh-issue-149489.bX9iHe.rst | 5 ++++ 3 files changed, 48 insertions(+), 10 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 8f3efe9fc90794b..b820845f3b63e21 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1278,7 +1278,13 @@ def check(p, expected, namespaces=None): {'': 'http://www.w3.org/2001/XMLSchema', 'ns': 'http://www.w3.org/2001/XMLSchema'}) - def test_processinginstruction(self): + def test_comment_serialization(self): + comm = ET.Comment(' & ham') + self.assertEqual(ET.tostring(comm), b'') + self.assertEqual(ET.tostring(comm, method='html'), b'') + self.assertEqual(ET.tostring(comm, method='text'), b' & ham') + + def test_processinginstruction_serialization(self): # Test ProcessingInstruction directly self.assertEqual(ET.tostring(ET.ProcessingInstruction('test', 'instruction')), @@ -1293,6 +1299,21 @@ def test_processinginstruction(self): self.assertEqual(ET.tostring(ET.PI('test', '\xe3'), 'latin-1'), b"\n" b"\xe3?>") + self.assertEqual(ET.tostring(ET.PI('test', 'ham & eggs < spam'), method='html'), + b'') + + def test_empty_attribute_serialization(self): + elem = ET.Element('tag', attrib={'attr': None}) + self.assertRaises(TypeError, ET.tostring, elem) + self.assertEqual(ET.tostring(elem, method='html'), b'') + + 
@support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) + def test_html_cdata_elems_serialization(self, tag): + tag = tag.title() + elem = ET.Element(tag) + elem.text = '&ham' + self.assertEqual(ET.tostring(elem, method='html'), + ('<%s>&ham' % (tag, tag)).encode()) def test_html_empty_elems_serialization(self): # issue 15970 @@ -1308,6 +1329,12 @@ def test_html_empty_elems_serialization(self): method='html') self.assertEqual(serialized, expected) + def test_html_plaintext_serialization(self): + elem = ET.Element('PlainText') + elem.text = '&ham' + self.assertEqual(ET.tostring(elem, method='html'), + b'<spam>&ham') + def test_dump_attribute_order(self): # See BPO 34160 e = ET.Element('cirriculum', status='public', company='example') diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 85766e02b531ce2..7b14ec360d7cf7c 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -907,17 +907,20 @@ def _serialize_xml(write, elem, qnames, namespaces, if elem.tail: write(_escape_cdata(elem.tail)) +_CDATA_CONTENT_ELEMENTS = {"script", "style", "xmp", "iframe", "noembed", + "noframes", "plaintext"} + HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr", "img", "input", "isindex", "link", "meta", "param", "source", - "track", "wbr"} + "track", "wbr", "plaintext"} def _serialize_html(write, elem, qnames, namespaces, **kwargs): tag = elem.tag text = elem.text if tag is Comment: - write("<!--%s-->" % _escape_cdata(text)) + write("<!--%s-->" % text) elif tag is ProcessingInstruction: - write("<?%s?>" % _escape_cdata(text)) + write("<?%s?>" % text) else: tag = qnames[tag] if tag is None: @@ -941,16 +944,19 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs): for k, v in items: if isinstance(k, QName): k = k.text - if isinstance(v, QName): - v = qnames[v.text] + k = qnames[k] + if v is None: + write(" %s" % k) else: - v = _escape_attrib_html(v) - # FIXME: handle 
boolean attributes - write(" %s=\"%s\"" % (qnames[k], v)) + if isinstance(v, QName): + v = qnames[v.text] + else: + v = _escape_attrib_html(v) + write(" %s=\"%s\"" % (k, v)) write(">") ltag = tag.lower() if text: - if ltag == "script" or ltag == "style": + if ltag in _CDATA_CONTENT_ELEMENTS: write(text) else: write(_escape_cdata(text)) diff --git a/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst b/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst new file mode 100644 index 000000000000000..1550c893fd7c45b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-07-14-18-47.gh-issue-149489.bX9iHe.rst @@ -0,0 +1,5 @@ +Fix :mod:`~xml.etree.ElementTree` serialization to HTML. The content of +comments, processing instructions and elements "xmp", "iframe", "noembed", +"noframes", and "plaintext" is no longer escaped. The "plaintext" element no +longer has a closing tag. Add support for empty attributes (with value +``None``). From a134c0b83ab6a612d44f7875efda7bb9f4625547 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Wed, 6 May 2026 22:23:29 +0300 Subject: [PATCH 2/8] gh-149468: Add option to validate ElementTree during serialization --- Doc/library/xml.etree.elementtree.rst | 30 ++- Doc/whatsnew/3.15.rst | 11 ++ Lib/test/test_xml_etree.py | 186 ++++++++++++++++++ Lib/xml/etree/ElementTree.py | 111 +++++++++-- ...-05-06-22-22-05.gh-issue-149468.IUSCzU.rst | 3 + 5 files changed, 321 insertions(+), 20 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst index 310ccd651e18c7e..b8c8b8f3c009ec8 100644 --- a/Doc/library/xml.etree.elementtree.rst +++ b/Doc/library/xml.etree.elementtree.rst @@ -711,14 +711,14 @@ Functions .. 
function:: tostring(element, encoding="us-ascii", method="xml", *, \ xml_declaration=None, default_namespace=None, \ - short_empty_elements=True) + validate=False, short_empty_elements=True) Generates a string representation of an XML element, including all subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to generate a Unicode string (otherwise, a bytestring is generated). *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). - *xml_declaration*, *default_namespace* and *short_empty_elements* has the same + *xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* has the same meaning as in :meth:`ElementTree.write`. Returns an (optionally) encoded string containing the XML data. @@ -732,17 +732,20 @@ Functions The :func:`tostring` function now preserves the attribute order specified by the user. + .. versionchanged:: next + Added the *validate* parameter. + .. function:: tostringlist(element, encoding="us-ascii", method="xml", *, \ xml_declaration=None, default_namespace=None, \ - short_empty_elements=True) + validate=False, short_empty_elements=True) Generates a string representation of an XML element, including all subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to generate a Unicode string (otherwise, a bytestring is generated). *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). - *xml_declaration*, *default_namespace* and *short_empty_elements* has the same + *xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* has the same meaning as in :meth:`ElementTree.write`. Returns a list of (optionally) encoded strings containing the XML data. It does not guarantee any specific sequence, except that ``b"".join(tostringlist(element)) == tostring(element)``. 
@@ -752,6 +755,9 @@ Functions .. versionchanged:: 3.4 Added the *short_empty_elements* parameter. + .. versionchanged:: next + Added the *validate* parameter. + .. versionchanged:: 3.8 Added the *xml_declaration* and *default_namespace* parameters. @@ -759,6 +765,9 @@ Functions The :func:`tostringlist` function now preserves the attribute order specified by the user. + .. versionchanged:: next + Added the *validate* parameter. + .. function:: XML(text, parser=None) @@ -1186,7 +1195,7 @@ ElementTree Objects .. method:: write(file, encoding="us-ascii", xml_declaration=None, \ default_namespace=None, method="xml", *, \ - short_empty_elements=True) + validate=False, short_empty_elements=True) Writes the element tree to a file, as XML. *file* is a file name, or a :term:`file object` opened for writing. *encoding* [1]_ is the output @@ -1197,6 +1206,14 @@ ElementTree Objects *default_namespace* sets the default XML namespace (for "xmlns"). *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). + + If *validate* is true, check that all characters are legal XML or HTML + characters, depending on *method*, element and attribute names are + valid, and the content of comments, processing instructions and + HTML elements like ``<script>`` do not contain illegal sequences, + and raise :exc:`ValueError` otherwise. + By default, no validation is performed. + The keyword-only *short_empty_elements* parameter controls the formatting of elements that contain no content. If ``True`` (the default), they are emitted as a single self-closed tag, otherwise they are emitted as a pair @@ -1216,6 +1233,9 @@ ElementTree Objects The :meth:`write` method now preserves the attribute order specified by the user. + .. versionchanged:: next + Added the *validate* parameter. 
+ This is the XML file that is going to be manipulated:: diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 9e2f789334ff02b..3a711c1a2cfedc2 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1813,6 +1813,17 @@ xml (Contributed by Serhiy Storchaka in :gh:`139489`.) +xml.etree.ElementTree +--------------------- + +* Add the *validate* option to functions + :func:`~xml.etree.ElementTree.tostring`, + :func:`~xml.etree.ElementTree.tostringlist`, and the + :meth:`ElementTree.write <xml.etree.ElementTree.ElementTree.write>` method, + which allows validating the element or element tree before serialization. + (Contributed by Serhiy Storchaka in :gh:`149468`.) + + xml.parsers.expat ----------------- diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index b820845f3b63e21..55b86769af128d4 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1358,6 +1358,192 @@ def test_attlist_default(self): {'{http://www.w3.org/XML/1998/namespace}lang': 'eng'}) +class XMLValidationTest(unittest.TestCase): + + def check(self, elem, expected=None): + self.assertRaises(ValueError, + ET.tostring, elem, validate=True) + ET.tostring(elem) # no exception + + def test_invalid_comment(self): + self.check(ET.Comment('a--b')) + self.check(ET.Comment(' B+, B, or B-')) + + def test_invalid_processing_instruction(self): + self.check(ET.PI('')) + self.check(ET.PI('0')) + self.check(ET.PI('a/b')) + self.check(ET.PI('foo\xa0bar')) + self.check(ET.PI('xml')) + self.check(ET.PI('xml', 'encoding="UTF-8"')) + self.check(ET.PI('foo', 'a?>b')) + self.check(ET.PI('foo', '\x00')) + self.check(ET.PI('foo', '\ud8ff')) + self.check(ET.PI('foo', '\ufffe')) + + def test_invalid_tag(self): + self.check(ET.Element('')) + self.check(ET.Element('0')) + self.check(ET.Element('a/b')) + self.check(ET.Element(ET.QName(''))) + self.check(ET.Element(ET.QName('0'))) + self.check(ET.Element(ET.QName('a/b'))) + + def test_invalid_attr_name(self): + 
self.check(ET.Element('tag', attrib={'': 'value'})) + self.check(ET.Element('tag', attrib={'0': 'value'})) + self.check(ET.Element('tag', attrib={'a/b': 'value'})) + self.check(ET.Element('tag', attrib={ET.QName(''): 'value'})) + self.check(ET.Element('tag', attrib={ET.QName('0'): 'value'})) + self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'})) + + def test_invalid_attr_value(self): + self.check(ET.Element('tag', attrib={'key': '\x00'})) + self.check(ET.Element('tag', attrib={'key': '\ud8ff'})) + self.check(ET.Element('tag', attrib={'key': '\ufffe'})) + self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')})) + self.check(ET.Element('tag', attrib={'key': ET.QName('\ud8ff')})) + self.check(ET.Element('tag', attrib={'key': ET.QName('\ufffe')})) + + def test_invalid_text(self): + elem = ET.Element('tag') + elem.text = '\x00' + self.check(elem) + elem.text = '\ud8ff' + self.check(elem) + elem.text = '\ufffe' + self.check(elem) + + def test_invalid_tail(self): + elem = ET.Element('tag') + elem.tail = '\x00' + self.check(elem) + elem.tail = '\ud8ff' + self.check(elem) + elem.tail = '\ufffe' + self.check(elem) + + def test_invalid_text_without_tag(self): + elem = ET.Element(None) + elem.text = '\x00' + self.check(elem) + elem.text = '\ud8ff' + self.check(elem) + elem.text = '\ufffe' + self.check(elem) + + def test_invalid_subelements(self): + elem = ET.Element('tag') + subelem = ET.SubElement(elem, 'subtag') + ET.SubElement(subelem, '\x00') + self.check(elem) + elem.tag = None + self.check(elem) + + def test_invalid_namespace_uri(self): + self.check(ET.Element('{\x00}tag')) + self.check(ET.Element('{\ud8ff}tag')) + self.check(ET.Element('{\ufffe}tag')) + self.check(ET.Element(ET.QName('\x00', 'tag'))) + self.check(ET.Element(ET.QName('\ud8ff', 'tag'))) + self.check(ET.Element(ET.QName('\ufffe', 'tag'))) + +class HTMLValidationTest(unittest.TestCase): + + def check(self, elem, expected=None): + self.assertRaises(ValueError, + ET.tostring, elem, 
method='html', validate=True) + ET.tostring(elem, method='html') # no exception + + def test_invalid_comment(self): + self.check(ET.Comment('>')) + self.check(ET.Comment('->')) + self.check(ET.Comment('a-->b')) + self.check(ET.Comment('a--!>b')) + self.check(ET.Comment('a\x00b')) + + def test_invalid_processing_instruction(self): + self.check(ET.PI('a>b')) + self.check(ET.PI('a\x00b')) + + def test_invalid_tag(self): + self.check(ET.Element('')) + self.check(ET.Element('?')) + self.check(ET.Element('!')) + self.check(ET.Element('0')) + self.check(ET.Element(' a')) + self.check(ET.Element('a b')) + self.check(ET.Element('a\nb')) + self.check(ET.Element('a/b')) + self.check(ET.Element('a>b')) + self.check(ET.Element('a\x00b')) + self.check(ET.Element(ET.QName(''))) + self.check(ET.Element(ET.QName('0'))) + self.check(ET.Element(ET.QName('a/b'))) + + def test_invalid_attr_name(self): + self.check(ET.Element('tag', attrib={'': 'value'})) + self.check(ET.Element('tag', attrib={'a/b': 'value'})) + self.check(ET.Element('tag', attrib={'a=b': 'value'})) + self.check(ET.Element('tag', attrib={ET.QName(''): 'value'})) + self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'})) + + def test_invalid_attr_value(self): + self.check(ET.Element('tag', attrib={'key': '\x00'})) + self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')})) + self.check(ET.Element('tag', attrib={'key': ET.QName('a"b')})) + self.check(ET.Element('tag', attrib={'key': ET.QName('a&b')})) + + def test_invalid_text(self): + elem = ET.Element('tag') + elem.text = '\x00' + self.check(elem) + + def test_invalid_tail(self): + elem = ET.Element('tag') + elem.tail = '\x00' + self.check(elem) + + def test_invalid_text_without_tag(self): + elem = ET.Element(None) + elem.text = '\x00' + self.check(elem) + + def test_invalid_subelements(self): + elem = ET.Element('tag') + subelem = ET.SubElement(elem, 'subtag') + ET.SubElement(subelem, '\x00') + self.check(elem) + elem.tag = None + self.check(elem) + + 
def test_invalid_namespace_uri(self): + self.check(ET.Element('{\x00}tag')) + self.check(ET.Element(ET.QName('\x00', 'tag'))) + + @support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) + def test_invalid_cdata_content(self, tag): + elem = ET.Element(tag.upper()) + elem.text = 'a</%s>b' % tag.title() + self.check(elem) + elem.text = 'a</%s b' % tag.title() + self.check(elem) + elem.text = 'a</%s/b' % tag.title() + self.check(elem) + elem.text = 'a\x00b' + self.check(elem) + + @support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) + def test_cdata_subelements(self, tag): + elem = ET.Element(tag) + ET.SubElement(elem, 'subtag') + self.check(elem) + + def test_invalid_plaintext_content(self): + elem = ET.Element('plaintext') + elem.text = 'a\x00b' + self.check(elem) + class IterparseTest(unittest.TestCase): # Test iterparse interface. diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 7b14ec360d7cf7c..6faf348aacf01b4 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -99,6 +99,7 @@ import weakref from . import ElementPath +from .. import is_valid_name, is_valid_text class ParseError(SyntaxError): @@ -689,6 +690,7 @@ def write(self, file_or_filename, xml_declaration=None, default_namespace=None, method=None, *, + validate=False, short_empty_elements=True): """Write element tree to a file as XML. @@ -706,6 +708,8 @@ def write(self, file_or_filename, *method* -- either "xml" (default), "html, "text", or "c14n" + *validate* -- if true, validate the content + *short_empty_elements* -- controls the formatting of elements that contain no content. 
If True (default) they are emitted as a single self-closed @@ -737,6 +741,7 @@ def write(self, file_or_filename, qnames, namespaces = _namespaces(self._root, default_namespace) serialize = _serialize[method] serialize(write, self._root, qnames, namespaces, + validate=validate, short_empty_elements=short_empty_elements) def write_c14n(self, file): @@ -857,23 +862,39 @@ def add_qname(qname): add_qname(text.text) return qnames, namespaces -def _serialize_xml(write, elem, qnames, namespaces, - short_empty_elements, **kwargs): +def _serialize_xml(write, elem, qnames, namespaces, *, + validate, short_empty_elements, **kwargs): tag = elem.tag text = elem.text if tag is Comment: + if validate: + if '--' in text or text.endswith('-'): + raise ValueError('invalid comment') write("<!--%s-->" % text) elif tag is ProcessingInstruction: + if validate: + m = re.search('[ \t\r\n]', text) + if m is not None: + target = text[:m.start()] + else: + target = text + if (not is_valid_name(target) or target.lower() == 'xml' + or '?>' in text or not is_valid_text(text)): + raise ValueError('invalid processing instruction') write("<?%s?>" % text) else: tag = qnames[tag] if tag is None: if text: - write(_escape_cdata(text)) + write(_escape_cdata(text, validate)) for e in elem: _serialize_xml(write, e, qnames, None, + validate=validate, short_empty_elements=short_empty_elements) else: + if validate: + if not is_valid_name(tag): + raise ValueError('invalid element name') write("<" + tag) items = list(elem.items()) if items or namespaces: @@ -882,30 +903,40 @@ def _serialize_xml(write, elem, qnames, namespaces, key=lambda x: x[1]): # sort on prefix if k: k = ":" + k + if validate: + if not is_valid_name(k): + raise ValueError('invalid namespace name') write(" xmlns%s=\"%s\"" % ( k, - _escape_attrib(v) + _escape_attrib(v, validate) )) for k, v in items: if isinstance(k, QName): k = k.text + if validate: + if not is_valid_name(qnames[k]): + raise ValueError('invalid attribute name') if 
isinstance(v, QName): v = qnames[v.text] + if validate: + if not is_valid_name(v): + raise ValueError('invalid attribute value') else: - v = _escape_attrib(v) + v = _escape_attrib(v, validate) write(" %s=\"%s\"" % (qnames[k], v)) if text or len(elem) or not short_empty_elements: write(">") if text: - write(_escape_cdata(text)) + write(_escape_cdata(text, validate)) for e in elem: _serialize_xml(write, e, qnames, None, + validate=validate, short_empty_elements=short_empty_elements) write("</" + tag + ">") else: write(" />") if elem.tail: - write(_escape_cdata(elem.tail)) + write(_escape_cdata(elem.tail, validate)) _CDATA_CONTENT_ELEMENTS = {"script", "style", "xmp", "iframe", "noembed", "noframes", "plaintext"} @@ -914,21 +945,34 @@ def _serialize_xml(write, elem, qnames, namespaces, "img", "input", "isindex", "link", "meta", "param", "source", "track", "wbr", "plaintext"} -def _serialize_html(write, elem, qnames, namespaces, **kwargs): +def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs): tag = elem.tag text = elem.text if tag is Comment: + if validate: + if (re.prefixmatch('-?>', text) or re.search('--!?>', text) + or '\0' in text): + raise ValueError('invalid comment') write("<!--%s-->" % text) elif tag is ProcessingInstruction: + if validate: + if '>' in text or '\0' in text: + raise ValueError('invalid processing instruction') write("<?%s?>" % text) else: tag = qnames[tag] if tag is None: if text: + if validate: + if '\0' in text: + raise ValueError('invalid characters') write(_escape_cdata(text)) for e in elem: - _serialize_html(write, e, qnames, None) + _serialize_html(write, e, qnames, None, validate=validate) else: + if validate: + if not re.fullmatch('[A-Za-z][^\0\t\n\r\f />]*+', tag): + raise ValueError('invalid element name') write("<" + tag) items = list(elem.items()) if items or namespaces: @@ -937,6 +981,12 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs): key=lambda x: x[1]): # sort on prefix if k: k = 
":" + k + if validate: + if not re.fullmatch('[^\0\t\n\r\f />=]++', k): + raise ValueError('invalid attribute name') + if validate: + if '\0' in v: + raise ValueError('invalid characters') write(" xmlns%s=\"%s\"" % ( k, _escape_attrib(v) @@ -945,26 +995,49 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs): if isinstance(k, QName): k = k.text k = qnames[k] + if validate: + if not re.fullmatch('[^\0\t\n\r\f />][^\0\t\n\r\f />=]*+', k): + raise ValueError('invalid attribute name') if v is None: - write(" %s" % k) + write(" %s" % (k,)) else: if isinstance(v, QName): v = qnames[v.text] + if validate: + if '\0' in v or '"' in v or '&' in v: + raise ValueError('invalid attribute value') else: + if validate: + if '\0' in v: + raise ValueError('invalid attribute value') v = _escape_attrib_html(v) write(" %s=\"%s\"" % (k, v)) write(">") ltag = tag.lower() if text: + if validate: + if '\0' in text: + raise ValueError('invalid characters') if ltag in _CDATA_CONTENT_ELEMENTS: + if validate: + if (ltag != "plaintext" + and re.search(r'</%s(?=[\t\n\r\f />])' % ltag, + text, re.IGNORECASE|re.ASCII)): + raise ValueError('invalid %s content' % ltag) write(text) else: write(_escape_cdata(text)) + if validate: + if ltag in _CDATA_CONTENT_ELEMENTS and len(elem): + raise ValueError('subelements in %s element' % ltag) for e in elem: - _serialize_html(write, e, qnames, None) + _serialize_html(write, e, qnames, None, validate=validate) if ltag not in HTML_EMPTY: write("</" + tag + ">") if elem.tail: + if validate: + if '\0' in elem.tail: + raise ValueError('invalid characters') write(_escape_cdata(elem.tail)) def _serialize_text(write, elem): @@ -1021,9 +1094,12 @@ def _raise_serialization_error(text): "cannot serialize %r (type %s)" % (text, type(text).__name__) ) -def _escape_cdata(text): +def _escape_cdata(text, validate=False): # escape character data try: + if validate: + if not is_valid_text(text): + raise ValueError('invalid characters') # it's worth avoiding 
do-nothing calls for strings that are # shorter than 500 characters, or so. assume that's, by far, # the most common case in most applications. @@ -1037,9 +1113,12 @@ def _escape_cdata(text): except (TypeError, AttributeError): _raise_serialization_error(text) -def _escape_attrib(text): +def _escape_attrib(text, validate=False): # escape attribute value try: + if validate: + if not is_valid_text(text): + raise ValueError('invalid attribute value') if "&" in text: text = text.replace("&", "&amp;") if "<" in text: @@ -1082,7 +1161,7 @@ def _escape_attrib_html(text): def tostring(element, encoding=None, method=None, *, xml_declaration=None, default_namespace=None, - short_empty_elements=True): + validate=False, short_empty_elements=True): """Generate string representation of XML element. All subelements are included. If encoding is "unicode", a string @@ -1101,6 +1180,7 @@ def tostring(element, encoding=None, method=None, *, xml_declaration=xml_declaration, default_namespace=default_namespace, method=method, + validate=validate, short_empty_elements=short_empty_elements) return stream.getvalue() @@ -1123,13 +1203,14 @@ def tell(self): def tostringlist(element, encoding=None, method=None, *, xml_declaration=None, default_namespace=None, - short_empty_elements=True): + validate=False, short_empty_elements=True): lst = [] stream = _ListDataStream(lst) ElementTree(element).write(stream, encoding, xml_declaration=xml_declaration, default_namespace=default_namespace, method=method, + validate=validate, short_empty_elements=short_empty_elements) return lst diff --git a/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst b/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst new file mode 100644 index 000000000000000..a4313cac07eea56 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst @@ -0,0 +1,3 @@ +Add the *validate* option to :mod:`xml.etree.ElementTree` serialization +functions, which allows 
to validate the element or element tree before +serialization. From ea414fa2a596066099e307bed88599bdfb65806f Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Sun, 31 May 2026 11:37:14 +0300 Subject: [PATCH 3/8] Apply suggestions from code review Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com> --- Doc/library/xml.etree.elementtree.rst | 20 +++++++++----------- Lib/test/test_xml_etree.py | 2 ++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst index b8c8b8f3c009ec8..27f1a998ac65104 100644 --- a/Doc/library/xml.etree.elementtree.rst +++ b/Doc/library/xml.etree.elementtree.rst @@ -718,7 +718,7 @@ Functions the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to generate a Unicode string (otherwise, a bytestring is generated). *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). - *xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* has the same + *xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* have the same meaning as in :meth:`ElementTree.write`. Returns an (optionally) encoded string containing the XML data. @@ -745,7 +745,7 @@ Functions the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to generate a Unicode string (otherwise, a bytestring is generated). *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). - *xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* has the same + *xml_declaration*, *default_namespace*, *validate* and *short_empty_elements* have the same meaning as in :meth:`ElementTree.write`. Returns a list of (optionally) encoded strings containing the XML data. It does not guarantee any specific sequence, except that ``b"".join(tostringlist(element)) == tostring(element)``. @@ -755,9 +755,6 @@ Functions .. 
versionchanged:: 3.4 Added the *short_empty_elements* parameter. - .. versionchanged:: next - Added the *validate* parameter. - .. versionchanged:: 3.8 Added the *xml_declaration* and *default_namespace* parameters. @@ -1207,12 +1204,13 @@ ElementTree Objects *method* is either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). - If *validate* is true, check that all characters are legal XML or HTML - characters, depending on *method*, element and attribute names are - valid, and the content of comments, processing instructions and - HTML elements like ``<script>`` do not contain illegal sequences, - and raise :exc:`ValueError` otherwise. - By default, no validation is performed. + If *validate* is true, check that all characters are legal, + that element and attribute names are valid, and that the content + of comments, processing instructions and HTML elements + like ``<script>`` do not contain illegal sequences according + to the selected *method* (``"xml"`` or ``"html"``). + Raise :exc:`ValueError` if any check fails. + By default, or if *method* is ``"text"``, no validation is performed. The keyword-only *short_empty_elements* parameter controls the formatting of elements that contain no content. If ``True`` (the default), they are diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index d27bcedc16c5882..6a4afb3c30971f7 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1477,6 +1477,7 @@ def test_invalid_namespace_uri(self): self.check(ET.Element(ET.QName('\ud8ff', 'tag'))) self.check(ET.Element(ET.QName('\ufffe', 'tag'))) + class HTMLValidationTest(unittest.TestCase): def check(self, elem, expected=None): @@ -1573,6 +1574,7 @@ def test_invalid_plaintext_content(self): elem.text = 'a\x00b' self.check(elem) + class IterparseTest(unittest.TestCase): # Test iterparse interface. 
From 474411fb36885c4887251f3487fa72afabcab0fa Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Sun, 31 May 2026 12:02:27 +0300 Subject: [PATCH 4/8] Add more tests for processing instructions --- Lib/test/test_xml_etree.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 6a4afb3c30971f7..6fd2c13be914521 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1389,11 +1389,14 @@ def test_attlist_default(self): class XMLValidationTest(unittest.TestCase): - def check(self, elem, expected=None): + def check(self, elem): self.assertRaises(ValueError, ET.tostring, elem, validate=True) ET.tostring(elem) # no exception + def check_valid(self, elem, expected): + self.assertEqual(ET.tostring(elem, validate=True), expected) + def test_invalid_comment(self): self.check(ET.Comment('a--b')) self.check(ET.Comment(' B+, B, or B-')) @@ -1403,13 +1406,19 @@ def test_invalid_processing_instruction(self): self.check(ET.PI('0')) self.check(ET.PI('a/b')) self.check(ET.PI('foo\xa0bar')) + self.check(ET.PI('foo\fbar')) self.check(ET.PI('xml')) + self.check(ET.PI('XML')) self.check(ET.PI('xml', 'encoding="UTF-8"')) self.check(ET.PI('foo', 'a?>b')) self.check(ET.PI('foo', '\x00')) self.check(ET.PI('foo', '\ud8ff')) self.check(ET.PI('foo', '\ufffe')) + self.check_valid(ET.PI('foo\tbar'), b'<?foo\tbar?>') + self.check_valid(ET.PI('foo\nbar'), b'<?foo\nbar?>') + self.check_valid(ET.PI('foo\rbar'), b'<?foo\rbar?>') + def test_invalid_tag(self): self.check(ET.Element('')) self.check(ET.Element('0')) @@ -1480,7 +1489,7 @@ def test_invalid_namespace_uri(self): class HTMLValidationTest(unittest.TestCase): - def check(self, elem, expected=None): + def check(self, elem): self.assertRaises(ValueError, ET.tostring, elem, method='html', validate=True) ET.tostring(elem, method='html') # no exception From 22e5543081583bab1e274b8d9de153792c24d7a6 Mon Sep 17 00:00:00 
2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Sun, 31 May 2026 12:29:41 +0300 Subject: [PATCH 5/8] Add more details in exceptions. --- Lib/xml/etree/ElementTree.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 3a937470073878c..f0981927c93290e 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -890,7 +890,7 @@ def _serialize_xml(write, elem, qnames, namespaces, *, target = text if (not is_valid_name(target) or target.lower() == 'xml' or '?>' in text or not is_valid_text(text)): - raise ValueError('invalid processing instruction') + raise ValueError(f'invalid processing instruction {elem.text!r}') write("<?%s?>" % text) else: tag = qnames[tag] @@ -904,7 +904,7 @@ def _serialize_xml(write, elem, qnames, namespaces, *, else: if validate: if not is_valid_name(tag): - raise ValueError('invalid element name') + raise ValueError(f'invalid element name {tag!r}') write("<" + tag) items = list(elem.items()) if items or namespaces: @@ -915,7 +915,7 @@ def _serialize_xml(write, elem, qnames, namespaces, *, k = ":" + k if validate: if not is_valid_name(k): - raise ValueError('invalid namespace name') + raise ValueError(f'invalid namespace name {k[1:]!r}') write(" xmlns%s=\"%s\"" % ( k, _escape_attrib(v, validate) @@ -925,12 +925,12 @@ def _serialize_xml(write, elem, qnames, namespaces, *, k = k.text if validate: if not is_valid_name(qnames[k]): - raise ValueError('invalid attribute name') + raise ValueError(f'invalid attribute name {k!r}') if isinstance(v, QName): v = qnames[v.text] if validate: if not is_valid_name(v): - raise ValueError('invalid attribute value') + raise ValueError(f'invalid attribute value {v!r}') else: v = _escape_attrib(v, validate) write(" %s=\"%s\"" % (qnames[k], v)) @@ -967,7 +967,7 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) elif tag is ProcessingInstruction: if 
validate: if '>' in text or '\0' in text: - raise ValueError('invalid processing instruction') + raise ValueError(f'invalid processing instruction {text!r}') write("<?%s?>" % text) else: tag = qnames[tag] @@ -982,7 +982,7 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) else: if validate: if not re.fullmatch('[A-Za-z][^\0\t\n\r\f />]*+', tag): - raise ValueError('invalid element name') + raise ValueError(f'invalid element name {tag!r}') write("<" + tag) items = list(elem.items()) if items or namespaces: @@ -993,7 +993,7 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) k = ":" + k if validate: if not re.fullmatch('[^\0\t\n\r\f />=]++', k): - raise ValueError('invalid attribute name') + raise ValueError(f'invalid attribute name {k!r}') if validate: if '\0' in v: raise ValueError('invalid characters') @@ -1007,7 +1007,7 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) k = qnames[k] if validate: if not re.fullmatch('[^\0\t\n\r\f />][^\0\t\n\r\f />=]*+', k): - raise ValueError('invalid attribute name') + raise ValueError(f'invalid attribute name {k!r}') if v is None: write(" %s" % k) # empty attr else: @@ -1015,11 +1015,11 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) v = qnames[v.text] if validate: if '\0' in v or '"' in v or '&' in v: - raise ValueError('invalid attribute value') + raise ValueError(f'invalid attribute value {v!r}') else: if validate: if '\0' in v: - raise ValueError('invalid attribute value') + raise ValueError(f'invalid attribute value {v!r}') v = _escape_attrib_html(v) write(" %s=\"%s\"" % (k, v)) write(">") From e12e88d698c8622d3ac8eb505400c902696b3b4e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Sun, 31 May 2026 13:49:58 +0300 Subject: [PATCH 6/8] Check also for surrogates in HTML. 
--- Lib/test/test_xml_etree.py | 26 ++++++++++++++++++++++++++ Lib/xml/etree/ElementTree.py | 29 ++++++++++++++++------------- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 6fd2c13be914521..d9f676446ba4c71 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -1400,6 +1400,10 @@ def check_valid(self, elem, expected): def test_invalid_comment(self): self.check(ET.Comment('a--b')) self.check(ET.Comment(' B+, B, or B-')) + self.check(ET.Comment('\x00')) + self.check(ET.Comment('\x01')) + self.check(ET.Comment('\ud8ff')) + self.check(ET.Comment('\ufffe')) def test_invalid_processing_instruction(self): self.check(ET.PI('')) @@ -1412,6 +1416,7 @@ def test_invalid_processing_instruction(self): self.check(ET.PI('xml', 'encoding="UTF-8"')) self.check(ET.PI('foo', 'a?>b')) self.check(ET.PI('foo', '\x00')) + self.check(ET.PI('foo', '\x01')) self.check(ET.PI('foo', '\ud8ff')) self.check(ET.PI('foo', '\ufffe')) @@ -1500,10 +1505,12 @@ def test_invalid_comment(self): self.check(ET.Comment('a-->b')) self.check(ET.Comment('a--!>b')) self.check(ET.Comment('a\x00b')) + self.check(ET.Comment('a\ud8ffb')) def test_invalid_processing_instruction(self): self.check(ET.PI('a>b')) self.check(ET.PI('a\x00b')) + self.check(ET.PI('a\ud8ffb')) def test_invalid_tag(self): self.check(ET.Element('')) @@ -1516,20 +1523,27 @@ def test_invalid_tag(self): self.check(ET.Element('a/b')) self.check(ET.Element('a>b')) self.check(ET.Element('a\x00b')) + self.check(ET.Element('a\ud8ffb')) self.check(ET.Element(ET.QName(''))) self.check(ET.Element(ET.QName('0'))) self.check(ET.Element(ET.QName('a/b'))) def test_invalid_attr_name(self): self.check(ET.Element('tag', attrib={'': 'value'})) + self.check(ET.Element('tag', attrib={'\x00': 'value'})) + self.check(ET.Element('tag', attrib={'\ud8ff': 'value'})) self.check(ET.Element('tag', attrib={'a/b': 'value'})) self.check(ET.Element('tag', attrib={'a=b': 
'value'})) + self.check(ET.Element('tag', attrib={'a\x00b': 'value'})) + self.check(ET.Element('tag', attrib={'a\ud8ffb': 'value'})) self.check(ET.Element('tag', attrib={ET.QName(''): 'value'})) self.check(ET.Element('tag', attrib={ET.QName('a/b'): 'value'})) def test_invalid_attr_value(self): self.check(ET.Element('tag', attrib={'key': '\x00'})) + self.check(ET.Element('tag', attrib={'key': '\ud8ff'})) self.check(ET.Element('tag', attrib={'key': ET.QName('\x00')})) + self.check(ET.Element('tag', attrib={'key': ET.QName('\ud8ff')})) self.check(ET.Element('tag', attrib={'key': ET.QName('a"b')})) self.check(ET.Element('tag', attrib={'key': ET.QName('a&b')})) @@ -1537,16 +1551,22 @@ def test_invalid_text(self): elem = ET.Element('tag') elem.text = '\x00' self.check(elem) + elem.text = '\ud8ff' + self.check(elem) def test_invalid_tail(self): elem = ET.Element('tag') elem.tail = '\x00' self.check(elem) + elem.tail = '\ud8ff' + self.check(elem) def test_invalid_text_without_tag(self): elem = ET.Element(None) elem.text = '\x00' self.check(elem) + elem.text = '\ud8ff' + self.check(elem) def test_invalid_subelements(self): elem = ET.Element('tag') @@ -1558,7 +1578,9 @@ def test_invalid_subelements(self): def test_invalid_namespace_uri(self): self.check(ET.Element('{\x00}tag')) + self.check(ET.Element('{\ud8ff}tag')) self.check(ET.Element(ET.QName('\x00', 'tag'))) + self.check(ET.Element(ET.QName('\ud8ff', 'tag'))) @support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) def test_invalid_cdata_content(self, tag): @@ -1571,6 +1593,8 @@ def test_invalid_cdata_content(self, tag): self.check(elem) elem.text = 'a\x00b' self.check(elem) + elem.text = 'a\ud8ffb' + self.check(elem) @support.subTests('tag', ("script", "style", "xmp", "iframe", "noembed", "noframes")) def test_cdata_subelements(self, tag): @@ -1582,6 +1606,8 @@ def test_invalid_plaintext_content(self): elem = ET.Element('plaintext') elem.text = 'a\x00b' self.check(elem) + elem.text = 
'a\ud8ffb' + self.check(elem) class IterparseTest(unittest.TestCase): diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index f0981927c93290e..53b6aaf4898a0cb 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -878,7 +878,7 @@ def _serialize_xml(write, elem, qnames, namespaces, *, text = elem.text if tag is Comment: if validate: - if '--' in text or text.endswith('-'): + if '--' in text or text.endswith('-') or not is_valid_text(text): raise ValueError('invalid comment') write("<!--%s-->" % text) elif tag is ProcessingInstruction: @@ -955,18 +955,21 @@ def _serialize_xml(write, elem, qnames, namespaces, *, "img", "input", "isindex", "link", "meta", "param", "source", "track", "wbr", "plaintext"} +def _is_valid_html_text(text): + return re.search('[\x00\ud800-\udfff]', text) is None + def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs): tag = elem.tag text = elem.text if tag is Comment: if validate: if (re.prefixmatch('-?>', text) or re.search('--!?>', text) - or '\0' in text): + or not _is_valid_html_text(text)): raise ValueError('invalid comment') write("<!--%s-->" % text) elif tag is ProcessingInstruction: if validate: - if '>' in text or '\0' in text: + if '>' in text or not _is_valid_html_text(text): raise ValueError(f'invalid processing instruction {text!r}') write("<?%s?>" % text) else: @@ -974,14 +977,14 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) if tag is None: if text: if validate: - if '\0' in text: + if not _is_valid_html_text(text): raise ValueError('invalid characters') write(_escape_cdata(text)) for e in elem: _serialize_html(write, e, qnames, None, validate=validate) else: if validate: - if not re.fullmatch('[A-Za-z][^\0\t\n\r\f />]*+', tag): + if not re.fullmatch('[A-Za-z][^\0\t\n\r\f />\ud800-\udfff]*+', tag): raise ValueError(f'invalid element name {tag!r}') write("<" + tag) items = list(elem.items()) @@ -992,10 +995,10 @@ def 
_serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) if k: k = ":" + k if validate: - if not re.fullmatch('[^\0\t\n\r\f />=]++', k): - raise ValueError(f'invalid attribute name {k!r}') + if not re.fullmatch('[^\0\t\n\r\f />=\ud800-\udfff]++', k): + raise ValueError(f'invalid namespace name {k[1:]!r}') if validate: - if '\0' in v: + if not _is_valid_html_text(v): raise ValueError('invalid characters') write(" xmlns%s=\"%s\"" % ( k, @@ -1006,7 +1009,7 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) k = k.text k = qnames[k] if validate: - if not re.fullmatch('[^\0\t\n\r\f />][^\0\t\n\r\f />=]*+', k): + if not re.fullmatch('[^\0\t\n\r\f />\ud800-\udfff][^\0\t\n\r\f />=\ud800-\udfff]*+', k): raise ValueError(f'invalid attribute name {k!r}') if v is None: write(" %s" % k) # empty attr @@ -1014,11 +1017,11 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) if isinstance(v, QName): v = qnames[v.text] if validate: - if '\0' in v or '"' in v or '&' in v: + if re.search('[\0"&\ud800-\udfff]', v): raise ValueError(f'invalid attribute value {v!r}') else: if validate: - if '\0' in v: + if not _is_valid_html_text(v): raise ValueError(f'invalid attribute value {v!r}') v = _escape_attrib_html(v) write(" %s=\"%s\"" % (k, v)) @@ -1026,7 +1029,7 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) ltag = tag.lower() if text: if validate: - if '\0' in text: + if not _is_valid_html_text(text): raise ValueError('invalid characters') if ltag in _CDATA_CONTENT_ELEMENTS: if validate: @@ -1046,7 +1049,7 @@ def _serialize_html(write, elem, qnames, namespaces, *, validate=True, **kwargs) write("</" + tag + ">") if elem.tail: if validate: - if '\0' in elem.tail: + if not _is_valid_html_text(elem.tail): raise ValueError('invalid characters') write(_escape_cdata(elem.tail)) From 8cc34e70929fee7fd975d6afccd2cc93f3161ab6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka 
<storchaka@gmail.com> Date: Sun, 31 May 2026 14:16:45 +0300 Subject: [PATCH 7/8] Move the What's New entry to 3.16. --- Doc/whatsnew/3.15.rst | 11 ----------- Doc/whatsnew/3.16.rst | 10 ++++++++++ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 4e6f1cc50d253b4..1d27baf38906e9a 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1830,17 +1830,6 @@ xml (Contributed by Serhiy Storchaka in :gh:`139489`.) -xml.etree.ElementTree ---------------------- - -* Add the *validate* option to functions - :func:`~xml.etree.ElementTree.tostring`, - :func:`~xml.etree.ElementTree.tostringlist`, and the - :meth:`Element.write <xml.etree.ElementTree.ElementTree.write>` method, - which allows to validate the element or element tree before serialization. - (Contributed by Serhiy Storchaka in :gh:`xxxxxx`.) - - xml.parsers.expat ----------------- diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst index 9a0a0d3d8831f5f..4f8b5485ccadbc6 100644 --- a/Doc/whatsnew/3.16.rst +++ b/Doc/whatsnew/3.16.rst @@ -209,6 +209,16 @@ tarfile * The undocumented and unused :attr:`!tarfile.TarFile.tarfile` attribute has been deprecated since Python 3.13. +xml.etree.ElementTree +--------------------- + +* Add the *validate* option to functions + :func:`~xml.etree.ElementTree.tostring`, + :func:`~xml.etree.ElementTree.tostringlist`, and the + :meth:`ElementTree.write <xml.etree.ElementTree.ElementTree.write>` method, + which allows validating the element or element tree before serialization. + (Contributed by Serhiy Storchaka in :gh:`149468`.) + .. Add removals above alphabetically, not here at the end. From 3d0fdd2e227da4d227e6c5ea896fec04284ebb67 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka <storchaka@gmail.com> Date: Sun, 31 May 2026 14:26:15 +0300 Subject: [PATCH 8/8] Update the NEWS entry.
--- .../Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst b/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst index a4313cac07eea56..10be07afa8b36ee 100644 --- a/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst +++ b/Misc/NEWS.d/next/Library/2026-05-06-22-22-05.gh-issue-149468.IUSCzU.rst @@ -1,3 +1,6 @@ Add the *validate* option to :mod:`xml.etree.ElementTree` serialization -functions, which allows to validate the element or element tree before -serialization. +functions, which allows checking that all characters are legal, +that element and attribute names are valid, and that the content +of comments, processing instructions and HTML elements +like ``<script>`` does not contain illegal sequences according +to the selected *method* (``"xml"`` or ``"html"``).