Skip to content

Commit 5fdf7d4

Browse files
authored
Correctly parse raw script and style tags. (#1038)
* Ensure unclosed script tags are parsed correctly by providing a workaround for https://bugs.python.org/issue41989.
* Avoid cdata_mode outside of HTML blocks, such as in inline code spans.

Fixes #1036.
1 parent e02ed39 commit 5fdf7d4

File tree

3 files changed

+156
-0
lines changed

3 files changed

+156
-0
lines changed

docs/change_log/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Python-Markdown Change Log
55

66
Under development: version 3.3.1 (a bug-fix release).
77

8+
* Correctly parse raw `script` and `style` tags (#1036).
89
* Ensure consistent class handling by `fenced_code` and `codehilite` (#1032).
910

1011
Oct 6, 2020: version 3.3 ([Notes](release-3.3.md)).

markdown/htmlparser.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,13 @@ def reset(self):
7272
def close(self):
7373
"""Handle any buffered data."""
7474
super().close()
75+
if len(self.rawdata):
76+
# Temp fix for https://bugs.python.org/issue41989
77+
# TODO: remove this when the bug is fixed in all supported Python versions.
78+
if self.convert_charrefs and not self.cdata_elem: # pragma: no cover
79+
self.handle_data(htmlparser.unescape(self.rawdata))
80+
else:
81+
self.handle_data(self.rawdata)
7582
# Handle any unclosed tags.
7683
if len(self._cache):
7784
self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
@@ -124,6 +131,9 @@ def handle_starttag(self, tag, attrs):
124131
self._cache.append(text)
125132
else:
126133
self.cleandoc.append(text)
134+
if tag in self.CDATA_CONTENT_ELEMENTS:
135+
# This is presumably a standalone tag in a code span (see #1036).
136+
self.clear_cdata_mode()
127137

128138
def handle_endtag(self, tag):
129139
text = self.get_endtag_text(tag)
@@ -200,3 +210,63 @@ def handle_pi(self, data):
200210
def unknown_decl(self, data):
    """Re-emit a `<![...` declaration as a block-level empty tag.

    The markup is rebuilt from `data`: it is closed with `]]>` when `data`
    starts with `CDATA[` and with `]>` otherwise, then handed to
    `handle_empty_tag` with `is_block=True`.
    """
    if data.startswith('CDATA['):
        closer = ']]>'
    else:
        closer = ']>'
    markup = '<![{}{}'.format(data, closer)
    self.handle_empty_tag(markup, is_block=True)
213+
214+
# The rest has been copied from base class in standard lib to address #1036.
215+
# As __starttag_text is private, all references to it must be in this subclass.
216+
# The last few lines of parse_starttag are reversed so that handle_starttag
217+
# can override cdata_mode in certain situations (in a code span).
218+
__starttag_text = None
219+
220+
def get_starttag_text(self):
    """Return the raw source text (`'<...>'`) of the last parsed start tag.

    The value is whatever `parse_starttag` stored most recently; it is
    `None` until a complete start tag has been seen.
    """
    starttag_source = self.__starttag_text
    return starttag_source
223+
224+
def parse_starttag(self, i):  # pragma: no cover
    """Parse the start tag beginning at `rawdata[i]` and dispatch its handler.

    This is a copy of the base-class implementation with one deliberate
    difference: for tags listed in `CDATA_CONTENT_ELEMENTS`, CDATA mode is
    switched on *before* `handle_starttag` is called, so that the handler is
    able to switch it off again (e.g. for a tag inside a code span, see
    #1036).

    Returns the index just past the start tag. If
    `check_for_whole_start_tag` reports a negative value (the tag is not
    complete yet), that value is returned unchanged and nothing is
    dispatched.
    """
    self.__starttag_text = None
    endpos = self.check_for_whole_start_tag(i)
    if endpos < 0:
        return endpos
    rawdata = self.rawdata
    self.__starttag_text = rawdata[i:endpos]

    # Now parse the data between i+1 and endpos into a tag and attrs
    attrs = []
    match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
    assert match, 'unexpected call to parse_starttag()'
    k = match.end()
    self.lasttag = tag = match.group(1).lower()
    while k < endpos:
        m = htmlparser.attrfind_tolerant.match(rawdata, k)
        if not m:
            break
        attrname, rest, attrvalue = m.group(1, 2, 3)
        if not rest:
            # Attribute given without any `=value` part.
            attrvalue = None
        elif attrvalue[:1] in ('\'', '"') and attrvalue[:1] == attrvalue[-1:]:
            # Strip a matching pair of surrounding quotes.
            attrvalue = attrvalue[1:-1]
        if attrvalue:
            attrvalue = htmlparser.unescape(attrvalue)
        attrs.append((attrname.lower(), attrvalue))
        k = m.end()

    end = rawdata[k:endpos].strip()
    if end not in (">", "/>"):
        # Whatever is left over is not a valid tag ending: pass the whole
        # span through as plain data. (The line/offset bookkeeping that used
        # to live here computed values that were never read.)
        self.handle_data(rawdata[i:endpos])
        return endpos
    if end.endswith('/>'):
        # XHTML-style empty tag: <span attr="value" />
        self.handle_startendtag(tag, attrs)
    else:
        # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
        if tag in self.CDATA_CONTENT_ELEMENTS:
            self.set_cdata_mode(tag)
        self.handle_starttag(tag, attrs)
    return endpos

tests/test_syntax/blocks/test_html_blocks.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1317,3 +1317,88 @@ def text_invalid_tags(self):
13171317
"""
13181318
)
13191319
)
1320+
1321+
def test_script_tags(self):
# Closed raw `<script>` and `<style>` blocks: the expected output below is
# identical to the input, i.e. their contents are not processed as Markdown.
1322+
self.assertMarkdownRenders(
1323+
self.dedent(
1324+
"""
1325+
<script>
1326+
*random stuff* <div> &amp;
1327+
</script>
1328+
1329+
<style>
1330+
**more stuff**
1331+
</style>
1332+
"""
1333+
),
1334+
self.dedent(
1335+
"""
1336+
<script>
1337+
*random stuff* <div> &amp;
1338+
</script>
1339+
1340+
<style>
1341+
**more stuff**
1342+
</style>
1343+
"""
1344+
)
1345+
)
1346+
1347+
def test_unclosed_script_tag(self):
1348+
# Ensure we have a working fix for https://bugs.python.org/issue41989
# The input has no closing `</script>`; the expected output is identical to
# the input, including the text after the blank line.
1349+
self.assertMarkdownRenders(
1350+
self.dedent(
1351+
"""
1352+
<script>
1353+
*random stuff* <div> &amp;
1354+
1355+
Still part of the *script* tag
1356+
"""
1357+
),
1358+
self.dedent(
1359+
"""
1360+
<script>
1361+
*random stuff* <div> &amp;
1362+
1363+
Still part of the *script* tag
1364+
"""
1365+
)
1366+
)
1367+
1368+
def test_inline_script_tags(self):
1369+
# Ensure inline script tags don't cause the parser to eat content (see #1036).
# The `<script>` / `</script>` tags appear only inside code spans: both
# paragraphs are expected to render normally and the `<div>` blocks in
# between are expected to pass through unchanged.
1370+
self.assertMarkdownRenders(
1371+
self.dedent(
1372+
"""
1373+
Text `<script>` more *text*.
1374+
1375+
<div>
1376+
*foo*
1377+
</div>
1378+
1379+
<div>
1380+
1381+
bar
1382+
1383+
</div>
1384+
1385+
A new paragraph with a closing `</script>` tag.
1386+
"""
1387+
),
1388+
self.dedent(
1389+
"""
1390+
<p>Text <code>&lt;script&gt;</code> more <em>text</em>.</p>
1391+
<div>
1392+
*foo*
1393+
</div>
1394+
1395+
<div>
1396+
1397+
bar
1398+
1399+
</div>
1400+
1401+
<p>A new paragraph with a closing <code>&lt;/script&gt;</code> tag.</p>
1402+
"""
1403+
)
1404+
)

0 commit comments

Comments
 (0)