Skip to content

Commit aa52e04

Browse files
committed
bpo-36676: Implement namespace prefix aware parsing support for the XMLParser target in ElementTree.
1 parent 2d2df11 commit aa52e04

File tree

5 files changed

+221
-28
lines changed

5 files changed

+221
-28
lines changed

Doc/library/xml.etree.elementtree.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,6 +1169,18 @@ XMLParser Objects
11691169
>>> parser.close()
11701170
4
11711171

1172+
Additionally, if the target object provides one or both of the methods
1173+
``start_ns(self, prefix, uri)`` and ``end_ns(self, prefix)``, then they
1174+
are called when a namespace declaration comes into scope and goes out of scope, respectively.
1175+
The ``prefix`` is ``''`` for the default namespace and the declared
1176+
namespace prefix otherwise. The ``start_ns()`` method is called before
1177+
the ``start()`` callback of the opening tag that defines the namespace,
1178+
and the ``end_ns()`` method is called after the corresponding ``end()``
1179+
callback.
1180+
1181+
.. versionchanged:: 3.8
1182+
The ``start_ns()`` and ``end_ns()`` callbacks were added.
1183+
11721184

11731185
.. _elementtree-xmlpullparser-objects:
11741186

Lib/test/test_xml_etree.py

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import warnings
1919
import weakref
2020

21-
from itertools import product
21+
from itertools import product, islice
2222
from test import support
2323
from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr
2424

@@ -693,12 +693,17 @@ def pi(self, target, data):
693693
self.append(("pi", target, data))
694694
def comment(self, data):
695695
self.append(("comment", data))
696+
def start_ns(self, prefix, uri):
    """Record a namespace declaration as a ("start-ns", prefix, uri) entry."""
    entry = ("start-ns", prefix, uri)
    self.append(entry)
698+
def end_ns(self, prefix):
    """Record the end of a namespace scope as an ("end-ns", prefix) entry."""
    entry = ("end-ns", prefix)
    self.append(entry)
696700
builder = Builder()
697701
parser = ET.XMLParser(target=builder)
698702
parser.feed(data)
699703
self.assertEqual(builder, [
700704
('pi', 'pi', 'data'),
701705
('comment', ' comment '),
706+
('start-ns', '', 'namespace'),
702707
('start', '{namespace}root'),
703708
('start', '{namespace}element'),
704709
('end', '{namespace}element'),
@@ -707,6 +712,7 @@ def comment(self, data):
707712
('start', '{namespace}empty-element'),
708713
('end', '{namespace}empty-element'),
709714
('end', '{namespace}root'),
715+
('end-ns', ''),
710716
])
711717

712718

@@ -1193,14 +1199,19 @@ def _feed(self, parser, data, chunk_size=None):
11931199
for i in range(0, len(data), chunk_size):
11941200
parser.feed(data[i:i+chunk_size])
11951201

1196-
def assert_events(self, parser, expected, max_events=None):
    """Check the pending parser events as (event, (tag, text)) pairs.

    At most *max_events* events are consumed from ``parser.read_events()``;
    with the default of ``None`` every pending event is consumed.
    """
    observed = []
    for event, elem in islice(parser.read_events(), max_events):
        observed.append((event, (elem.tag, elem.text)))
    self.assertEqual(observed, expected)
12011207

1202-
def assert_event_tags(self, parser, expected):
1203-
events = parser.read_events()
1208+
def assert_event_tuples(self, parser, expected, max_events=None):
    """Check the pending parser events exactly as the parser yields them.

    At most *max_events* events are consumed from ``parser.read_events()``;
    with the default of ``None`` every pending event is consumed.
    """
    observed = [*islice(parser.read_events(), max_events)]
    self.assertEqual(observed, expected)
1212+
1213+
def assert_event_tags(self, parser, expected, max_events=None):
    """Check the pending parser events as (action, tag) pairs.

    At most *max_events* events are consumed from ``parser.read_events()``;
    with the default of ``None`` every pending event is consumed.
    """
    observed = []
    for action, elem in islice(parser.read_events(), max_events):
        observed.append((action, elem.tag))
    self.assertEqual(observed, expected)
12061217

@@ -1275,6 +1286,56 @@ def test_ns_events(self):
12751286
self.assertEqual(list(parser.read_events()), [('end-ns', None)])
12761287
self.assertIsNone(parser.close())
12771288

1289+
def test_ns_events_start(self):
    """Pull parser reports 'start-ns' events alongside 'start' and 'end'."""
    pull_parser = ET.XMLPullParser(events=('start-ns', 'start', 'end'))

    # Opening tag: the two namespace declarations are expected first,
    # followed by the 'start' event of the element that declares them.
    self._feed(pull_parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
    expected_declarations = [
        ('start-ns', ('', 'abc')),
        ('start-ns', ('p', 'xyz')),
    ]
    self.assert_event_tuples(pull_parser, expected_declarations, max_events=2)
    self.assert_event_tags(pull_parser, [('start', '{abc}tag')], max_events=1)

    # The child element is resolved against the default namespace.
    self._feed(pull_parser, "<child />\n")
    expected_child = [
        ('start', '{abc}child'),
        ('end', '{abc}child'),
    ]
    self.assert_event_tags(pull_parser, expected_child)

    # Closing the root yields only its 'end' event.
    self._feed(pull_parser, "</tag>\n")
    pull_parser.close()
    self.assert_event_tags(pull_parser, [('end', '{abc}tag')])
1311+
1312+
def test_ns_events_start_end(self):
    """Pull parser reports both 'start-ns' and 'end-ns' events."""
    pull_parser = ET.XMLPullParser(
        events=('start-ns', 'start', 'end', 'end-ns'))

    # Opening tag: the two namespace declarations are expected first,
    # followed by the 'start' event of the element that declares them.
    self._feed(pull_parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
    expected_declarations = [
        ('start-ns', ('', 'abc')),
        ('start-ns', ('p', 'xyz')),
    ]
    self.assert_event_tuples(pull_parser, expected_declarations, max_events=2)
    self.assert_event_tags(pull_parser, [('start', '{abc}tag')], max_events=1)

    # The child element is resolved against the default namespace.
    self._feed(pull_parser, "<child />\n")
    expected_child = [
        ('start', '{abc}child'),
        ('end', '{abc}child'),
    ]
    self.assert_event_tags(pull_parser, expected_child)

    # Closing the root yields its 'end' event, then one 'end-ns' event
    # (carrying None) for each of the two declarations.
    self._feed(pull_parser, "</tag>\n")
    pull_parser.close()
    self.assert_event_tags(pull_parser, [('end', '{abc}tag')], max_events=1)
    expected_scope_ends = [
        ('end-ns', None),
        ('end-ns', None),
    ]
    self.assert_event_tuples(pull_parser, expected_scope_ends)
1338+
12781339
def test_events(self):
12791340
parser = ET.XMLPullParser(events=())
12801341
self._feed(parser, "<root/>\n")

Lib/xml/etree/ElementTree.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,6 +1518,10 @@ def __init__(self, *, target=None, encoding=None):
15181518
parser.StartElementHandler = self._start
15191519
if hasattr(target, 'end'):
15201520
parser.EndElementHandler = self._end
1521+
if hasattr(target, 'start_ns'):
1522+
parser.StartNamespaceDeclHandler = self._start_ns
1523+
if hasattr(target, 'end_ns'):
1524+
parser.EndNamespaceDeclHandler = self._end_ns
15211525
if hasattr(target, 'data'):
15221526
parser.CharacterDataHandler = target.data
15231527
# miscellaneous callbacks
@@ -1559,12 +1563,24 @@ def handler(tag, event=event_name, append=append,
15591563
append((event, end(tag)))
15601564
parser.EndElementHandler = handler
15611565
elif event_name == "start-ns":
1562-
def handler(prefix, uri, event=event_name, append=append):
1563-
append((event, (prefix or "", uri or "")))
1566+
# TreeBuilder does not implement .start_ns()
1567+
if hasattr(self.target, "start_ns"):
1568+
def handler(prefix, uri, event=event_name, append=append,
1569+
start_ns=self._start_ns):
1570+
append((event, start_ns(prefix, uri)))
1571+
else:
1572+
def handler(prefix, uri, event=event_name, append=append):
1573+
append((event, (prefix or '', uri or '')))
15641574
parser.StartNamespaceDeclHandler = handler
15651575
elif event_name == "end-ns":
1566-
def handler(prefix, event=event_name, append=append):
1567-
append((event, None))
1576+
# TreeBuilder does not implement .end_ns()
1577+
if hasattr(self.target, "end_ns"):
1578+
def handler(prefix, event=event_name, append=append,
1579+
end_ns=self._end_ns):
1580+
append((event, end_ns(prefix)))
1581+
else:
1582+
def handler(prefix, event=event_name, append=append):
1583+
append((event, None))
15681584
parser.EndNamespaceDeclHandler = handler
15691585
elif event_name == 'comment':
15701586
def handler(text, event=event_name, append=append, self=self):
@@ -1595,6 +1611,12 @@ def _fixname(self, key):
15951611
self._names[key] = name
15961612
return name
15971613

1614+
def _start_ns(self, prefix, uri):
    """Forward a namespace declaration to ``self.target.start_ns()``.

    A falsy *prefix* or *uri* is passed on as the empty string.  The
    target's return value is returned unchanged.
    """
    declared_prefix = prefix if prefix else ''
    declared_uri = uri if uri else ''
    return self.target.start_ns(declared_prefix, declared_uri)
1616+
1617+
def _end_ns(self, prefix):
    """Forward the end of a namespace scope to ``self.target.end_ns()``.

    A falsy *prefix* is passed on as the empty string.  The target's
    return value is returned unchanged.
    """
    declared_prefix = prefix if prefix else ''
    return self.target.end_ns(declared_prefix)
1619+
15981620
def _start(self, tag, attr_list):
15991621
# Handler for expat's StartElementHandler. Since ordered_attributes
16001622
# is set, the attributes are reported as a list of alternating
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The XMLParser() in xml.etree.ElementTree provides namespace prefix context to the
2+
parser target if it defines the callback methods "start_ns()" and/or "end_ns()".
3+
Patch by Stefan Behnel.

Modules/_elementtree.c

Lines changed: 114 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2911,6 +2911,39 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
29112911
return NULL;
29122912
}
29132913

2914+
/* Queue a start-ns event carrying the tuple (prefix, uri).
 *
 * Does nothing when the builder has no events_append callable or no
 * start_ns_event_obj.  Returns a new reference to None on success and
 * NULL if building the tuple or appending the event fails.
 */
LOCAL(PyObject*)
treebuilder_handle_start_ns(TreeBuilderObject* self, PyObject* prefix, PyObject* uri)
{
    PyObject* event_data;
    int status;

    if (!self->events_append || !self->start_ns_event_obj) {
        Py_RETURN_NONE;
    }

    event_data = PyTuple_Pack(2, prefix, uri);
    if (event_data == NULL) {
        return NULL;
    }

    /* The tuple is released on both the success and the failure path. */
    status = treebuilder_append_event(self, self->start_ns_event_obj, event_data);
    Py_DECREF(event_data);
    if (status < 0) {
        return NULL;
    }

    Py_RETURN_NONE;
}
2934+
2935+
/* Queue an end-ns event carrying the given prefix object.
 *
 * Does nothing when the builder has no events_append callable or no
 * end_ns_event_obj.  Returns a new reference to None on success and
 * NULL if appending the event fails.
 */
LOCAL(PyObject*)
treebuilder_handle_end_ns(TreeBuilderObject* self, PyObject* prefix)
{
    if (!self->events_append || !self->end_ns_event_obj) {
        Py_RETURN_NONE;
    }

    if (treebuilder_append_event(self, self->end_ns_event_obj, prefix) < 0) {
        return NULL;
    }

    Py_RETURN_NONE;
}
2946+
29142947
/* -------------------------------------------------------------------- */
29152948
/* methods (in alphabetical order) */
29162949

@@ -3046,6 +3079,8 @@ typedef struct {
30463079

30473080
PyObject *names;
30483081

3082+
PyObject *handle_start_ns;
3083+
PyObject *handle_end_ns;
30493084
PyObject *handle_start;
30503085
PyObject *handle_data;
30513086
PyObject *handle_end;
@@ -3357,42 +3392,85 @@ expat_end_handler(XMLParserObject* self, const XML_Char* tag_in)
33573392
}
33583393

33593394
static void
3360-
expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix,
3361-
const XML_Char *uri)
3395+
expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix_in,
3396+
const XML_Char *uri_in)
33623397
{
3363-
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
3364-
PyObject *parcel;
3398+
PyObject* res = NULL;
3399+
PyObject* uri;
3400+
PyObject* prefix;
3401+
PyObject* stack[2];
33653402

33663403
if (PyErr_Occurred())
33673404
return;
33683405

3369-
if (!target->events_append || !target->start_ns_event_obj)
3370-
return;
3406+
if (!uri_in)
3407+
uri_in = "";
3408+
if (!prefix_in)
3409+
prefix_in = "";
3410+
3411+
if (TreeBuilder_CheckExact(self->target)) {
3412+
/* shortcut - TreeBuilder does not actually implement .start_ns() */
3413+
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
33713414

3372-
if (!uri)
3373-
uri = "";
3374-
if (!prefix)
3375-
prefix = "";
3415+
if (target->events_append && target->start_ns_event_obj) {
3416+
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
3417+
if (!prefix)
3418+
return;
3419+
uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
3420+
if (!uri)
3421+
return;
33763422

3377-
parcel = Py_BuildValue("ss", prefix, uri);
3378-
if (!parcel)
3379-
return;
3380-
treebuilder_append_event(target, target->start_ns_event_obj, parcel);
3381-
Py_DECREF(parcel);
3423+
res = treebuilder_handle_start_ns(target, prefix, uri);
3424+
Py_DECREF(uri);
3425+
Py_DECREF(prefix);
3426+
}
3427+
} else if (self->handle_start_ns) {
3428+
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
3429+
if (!prefix)
3430+
return;
3431+
uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
3432+
if (!uri)
3433+
return;
3434+
3435+
stack[0] = prefix;
3436+
stack[1] = uri;
3437+
res = _PyObject_FastCall(self->handle_start_ns, stack, 2);
3438+
Py_DECREF(uri);
3439+
Py_DECREF(prefix);
3440+
}
3441+
3442+
Py_XDECREF(res);
33823443
}
33833444

33843445
static void
33853446
expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in)
33863447
{
3387-
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
3448+
PyObject *res = NULL;
3449+
PyObject* prefix;
33883450

33893451
if (PyErr_Occurred())
33903452
return;
33913453

3392-
if (!target->events_append)
3393-
return;
3454+
if (!prefix_in)
3455+
prefix_in = "";
33943456

3395-
treebuilder_append_event(target, target->end_ns_event_obj, Py_None);
3457+
if (TreeBuilder_CheckExact(self->target)) {
3458+
/* shortcut - TreeBuilder does not actually implement .end_ns() */
3459+
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
3460+
3461+
if (target->events_append && target->end_ns_event_obj) {
3462+
res = treebuilder_handle_end_ns(target, Py_None);
3463+
}
3464+
} else if (self->handle_end_ns) {
3465+
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
3466+
if (!prefix)
3467+
return;
3468+
3469+
res = _PyObject_FastCall(self->handle_end_ns, &prefix, 1);
3470+
Py_DECREF(prefix);
3471+
}
3472+
3473+
Py_XDECREF(res);
33963474
}
33973475

33983476
static void
@@ -3546,6 +3624,7 @@ xmlparser_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
35463624
if (self) {
35473625
self->parser = NULL;
35483626
self->target = self->entity = self->names = NULL;
3627+
self->handle_start_ns = self->handle_end_ns = NULL;
35493628
self->handle_start = self->handle_data = self->handle_end = NULL;
35503629
self->handle_comment = self->handle_pi = self->handle_close = NULL;
35513630
self->handle_doctype = NULL;
@@ -3614,6 +3693,14 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
36143693
}
36153694
self->target = target;
36163695

3696+
self->handle_start_ns = PyObject_GetAttrString(target, "start_ns");
3697+
if (ignore_attribute_error(self->handle_start_ns)) {
3698+
return -1;
3699+
}
3700+
self->handle_end_ns = PyObject_GetAttrString(target, "end_ns");
3701+
if (ignore_attribute_error(self->handle_end_ns)) {
3702+
return -1;
3703+
}
36173704
self->handle_start = PyObject_GetAttrString(target, "start");
36183705
if (ignore_attribute_error(self->handle_start)) {
36193706
return -1;
@@ -3645,6 +3732,12 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
36453732

36463733
/* configure parser */
36473734
EXPAT(SetUserData)(self->parser, self);
3735+
if (self->handle_start_ns || self->handle_end_ns)
3736+
EXPAT(SetNamespaceDeclHandler)(
3737+
self->parser,
3738+
(XML_StartNamespaceDeclHandler) expat_start_ns_handler,
3739+
(XML_EndNamespaceDeclHandler) expat_end_ns_handler
3740+
);
36483741
EXPAT(SetElementHandler)(
36493742
self->parser,
36503743
(XML_StartElementHandler) expat_start_handler,
@@ -3689,6 +3782,7 @@ xmlparser_gc_traverse(XMLParserObject *self, visitproc visit, void *arg)
36893782
Py_VISIT(self->handle_end);
36903783
Py_VISIT(self->handle_data);
36913784
Py_VISIT(self->handle_start);
3785+
Py_VISIT(self->handle_start_ns);
36923786

36933787
Py_VISIT(self->target);
36943788
Py_VISIT(self->entity);
@@ -3712,6 +3806,7 @@ xmlparser_gc_clear(XMLParserObject *self)
37123806
Py_CLEAR(self->handle_end);
37133807
Py_CLEAR(self->handle_data);
37143808
Py_CLEAR(self->handle_start);
3809+
Py_CLEAR(self->handle_start_ns);
37153810
Py_CLEAR(self->handle_doctype);
37163811

37173812
Py_CLEAR(self->target);

0 commit comments

Comments
 (0)