Skip to content

Commit ac111ea

Browse files
authored
👌 Handling of nested headers (#711)
For the longest time, nested headers in myst-parser have been a pain, particularly in things like admonitions.

In Markdown (and HTML), headings are allowed "anywhere", for example:

```markdown
> # Heading 1
## Heading 2
Paragraph
```

is rendered as:

```html
<blockquote>
<h1>Heading 1</h1>
</blockquote>
<h2>Heading 2</h2>
<p>Paragraph</p>
```

However, because docutils/sphinx treats headers as nested sections, this becomes problematic:

```xml
<blockquote>
  <section>
    <title>
      Heading 1
    <section>
      <title>
        Heading 2
      <paragraph>
        Paragraph
```

Sphinx cannot resolve the ToC tree (etc.) from such a structure.

This PR fixes this by identifying if a heading is inside another component, and instead outputting it as a "non-structural" rubric node:

```xml
<blockquote>
  <rubric level=1>
    Heading 1
<section>
  <title>
    Heading 2
  <paragraph>
    Paragraph
```

Natively, docutils/sphinx does not deal with the "level" key in the rubric, so here we also override the rubric HTML renderer to correctly output a `<h>` element, if "level" is present, to retrieve the desired:

```html
<blockquote>
<h1>Heading 1</h1>
</blockquote>
<h2>Heading 2</h2>
<p>Paragraph</p>
```

There is no longer any warning of nested headers, since this is the intended behaviour.

To clarify, the logic is now:

- A section can only be a child of the root document, or another section
- If a header token is encountered, with a parent that is not one of these, then it is added as a rubric
- Otherwise a new section is created, and the heading is added as a title, which is a child of the section
1 parent 1096e52 commit ac111ea

File tree

8 files changed

+150
-40
lines changed

8 files changed

+150
-40
lines changed

myst_parser/mdit_to_docutils/base.py

Lines changed: 43 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def __getattr__(self, name: str):
118118
"current_node",
119119
"reporter",
120120
"language_module_rst",
121-
"_level_to_elem",
121+
"_level_to_section",
122122
):
123123
raise AttributeError(
124124
f"'{name}' attribute is not available until setup_render() is called"
@@ -143,7 +143,7 @@ def setup_render(
143143
self.document.settings.language_code
144144
)
145145
# a mapping of heading levels to its currently associated node
146-
self._level_to_elem: dict[int, nodes.document | nodes.section] = {
146+
self._level_to_section: dict[int, nodes.document | nodes.section] = {
147147
0: self.document
148148
}
149149
# mapping of section slug to section node
@@ -321,14 +321,18 @@ def _render_finalise(self) -> None:
321321
)
322322

323323
def nested_render_text(
324-
self, text: str, lineno: int, inline: bool = False, allow_headings: bool = True
324+
self,
325+
text: str,
326+
lineno: int,
327+
inline: bool = False,
328+
temp_root_node: None | nodes.Element = None,
325329
) -> None:
326330
"""Render unparsed text (appending to the current node).
327331
328332
:param text: the text to render
329333
:param lineno: the starting line number of the text, within the full source
330334
:param inline: whether the text is inline or block
331-
:param allow_headings: whether to allow headings in the text
335+
:param temp_root_node: If set, allow sections to be created as children of this node
332336
"""
333337
tokens = (
334338
self.md.parseInline(text, self.md_env)
@@ -345,12 +349,21 @@ def nested_render_text(
345349
if token.map:
346350
token.map = [token.map[0] + lineno, token.map[1] + lineno]
347351

348-
current_match_titles = self.md_env.get("match_titles", None)
349-
try:
350-
self.md_env["match_titles"] = allow_headings
352+
if temp_root_node is None:
351353
self._render_tokens(tokens)
352-
finally:
353-
self.md_env["match_titles"] = current_match_titles
354+
else:
355+
# we need to temporarily set the root node,
356+
# and we also want to restore the level_to_section mapping at the end
357+
current_level_to_section = {
358+
i: node for i, node in self._level_to_section.items()
359+
}
360+
current_root_node = self.md_env.get("temp_root_node", None)
361+
try:
362+
self.md_env["temp_root_node"] = temp_root_node
363+
self._render_tokens(tokens)
364+
finally:
365+
self.md_env["temp_root_node"] = current_root_node
366+
self._level_to_section = current_level_to_section
354367

355368
@contextmanager
356369
def current_node_context(
@@ -444,10 +457,10 @@ def update_section_level_state(self, section: nodes.section, level: int) -> None
444457
# find the closest parent section
445458
parent_level = max(
446459
section_level
447-
for section_level in self._level_to_elem
460+
for section_level in self._level_to_section
448461
if level > section_level
449462
)
450-
parent = self._level_to_elem[parent_level]
463+
parent = self._level_to_section[parent_level]
451464

452465
# if we are jumping up to a non-consecutive level,
453466
# then warn about this, since this will not be propagated in the docutils AST
@@ -465,12 +478,12 @@ def update_section_level_state(self, section: nodes.section, level: int) -> None
465478
# append the new section to the parent
466479
parent.append(section)
467480
# update the state for this section level
468-
self._level_to_elem[level] = section
481+
self._level_to_section[level] = section
469482

470483
# Remove all descendant sections from the section level state
471-
self._level_to_elem = {
484+
self._level_to_section = {
472485
section_level: section
473-
for section_level, section in self._level_to_elem.items()
486+
for section_level, section in self._level_to_section.items()
474487
if section_level <= level
475488
}
476489

@@ -769,30 +782,28 @@ def blocks_mathjax_processing(self) -> bool:
769782
def render_heading(self, token: SyntaxTreeNode) -> None:
770783
"""Render a heading, e.g. `# Heading`."""
771784

772-
if (
773-
token.attrs.get("toc", None) == "false"
774-
or self.md_env.get("match_titles", None) is False
775-
):
776-
if token.attrs.get("toc", None) != "false":
777-
# this can occur if a nested parse is performed by a directive
778-
# (such as an admonition) which contains a header.
779-
# this would break the document structure
780-
self.create_warning(
781-
"Disallowed nested header found, converting to rubric",
782-
MystWarnings.MD_HEADING_NESTED,
783-
line=token_line(token, default=0),
784-
append_to=self.current_node,
785-
)
785+
level = int(token.tag[1])
786786

787-
rubric = nodes.rubric(token.content, "")
787+
# sections are only allowed as a child of a document or another section
788+
# the only exception to this, is if a directive has called a nested parse,
789+
# and specifically specified that sections are allowed to be created as children
790+
# of its root node (a.k.a match_titles=True)
791+
parent_of_temp_root = (
792+
self.md_env.get("temp_root_node", None) is not None
793+
and self.current_node == self.md_env["temp_root_node"]
794+
)
795+
if not (
796+
parent_of_temp_root
797+
or isinstance(self.current_node, (nodes.document, nodes.section))
798+
):
799+
# if this is not the case, we create a rubric node instead
800+
rubric = nodes.rubric(token.content, "", level=level)
788801
self.add_line_and_source_path(rubric, token)
789802
self.copy_attributes(token, rubric, ("class", "id"))
790803
with self.current_node_context(rubric, append=True):
791804
self.render_children(token)
792805
return
793806

794-
level = int(token.tag[1])
795-
796807
# create the section node
797808
new_section = nodes.section()
798809
self.add_line_and_source_path(new_section, token)
@@ -1769,7 +1780,7 @@ def render_substitution(self, token: SyntaxTreeNode, inline: bool) -> None:
17691780
if inline and not REGEX_DIRECTIVE_START.match(rendered):
17701781
self.nested_render_text(rendered, position, inline=True)
17711782
else:
1772-
self.nested_render_text(rendered, position, allow_headings=False)
1783+
self.nested_render_text(rendered, position)
17731784
finally:
17741785
self.document.sub_references.difference_update(references)
17751786

myst_parser/mocking.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ class Struct:
118118
reporter = self.document.reporter
119119
language = renderer.language_module_rst
120120
title_styles: list[str] = []
121-
section_level = max(renderer._level_to_elem)
121+
section_level = max(renderer._level_to_section)
122122
section_bubble_up_kludge = False
123123
inliner = self.inliner
124124

@@ -174,7 +174,7 @@ def nested_parse(
174174
self._renderer.nested_render_text(
175175
"\n".join(block),
176176
self._lineno + input_offset,
177-
allow_headings=match_titles,
177+
temp_root_node=node if match_titles else None,
178178
)
179179
self.state_machine.match_titles = sm_match_titles
180180

@@ -469,9 +469,7 @@ def run(self) -> list[nodes.Element]:
469469
source_dir,
470470
path.parent,
471471
)
472-
self.renderer.nested_render_text(
473-
file_content, startline + 1, allow_headings=True
474-
)
472+
self.renderer.nested_render_text(file_content, startline + 1)
475473
finally:
476474
self.renderer.document["source"] = source
477475
self.renderer.reporter.source = rsource

myst_parser/parsers/docutils_.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,10 @@ def parse(self, inputstring: str, document: nodes.document) -> None:
257257
:param inputstring: The source string to parse
258258
:param document: The root docutils node to add AST elements to
259259
"""
260+
from docutils.writers._html_base import HTMLTranslator
261+
262+
HTMLTranslator.visit_rubric = visit_rubric_html
263+
HTMLTranslator.depart_rubric = depart_rubric_html
260264

261265
self.setup_parse(inputstring, document)
262266

@@ -350,3 +354,75 @@ def cli_xml(argv: Optional[List[str]] = None):
350354
def cli_pseudoxml(argv: Optional[List[str]] = None):
351355
"""Cmdline entrypoint for converting MyST to pseudo-XML."""
352356
_run_cli("pseudoxml", "pseudo-XML", argv)
357+
358+
359+
def visit_rubric_html(self, node):
360+
"""Override the default HTML visit method for rubric nodes.
361+
362+
docutils structures a document, based on the headings, into nested sections::
363+
364+
# h1
365+
## h2
366+
### h3
367+
368+
<section>
369+
<title>
370+
h1
371+
<section>
372+
<title>
373+
h2
374+
<section>
375+
<title>
376+
h3
377+
378+
This means that it is not possible to have "standard" headings nested inside
379+
other components, such as blockquotes, because it would break the structure::
380+
381+
# h1
382+
> ## h2
383+
### h3
384+
385+
<section>
386+
<title>
387+
h1
388+
<blockquote>
389+
<section>
390+
<title>
391+
h2
392+
<section>
393+
<title>
394+
h3
395+
396+
we work around this shortcoming, in `DocutilsRenderer.render_heading`,
397+
by identifying if a heading is inside another component
398+
and instead outputting it as a "non-structural" rubric node, and capture the level::
399+
400+
<section>
401+
<title>
402+
h1
403+
<blockquote>
404+
<rubric level=2>
405+
h2
406+
<section>
407+
<title>
408+
h3
409+
410+
However, docutils natively just outputs rubrics as <p> tags,
411+
and does not "honor" the heading level.
412+
So here we override the visit/depart methods to output the correct <h> element
413+
"""
414+
if "level" in node:
415+
self.body.append(self.starttag(node, f'h{node["level"]}', "", CLASS="rubric"))
416+
else:
417+
self.body.append(self.starttag(node, "p", "", CLASS="rubric"))
418+
419+
420+
def depart_rubric_html(self, node):
421+
"""Override the default HTML depart method for rubric nodes.
422+
423+
See explanation in `visit_rubric_html`
424+
"""
425+
if "level" in node:
426+
self.body.append(f'</h{node["level"]}>\n')
427+
else:
428+
self.body.append("</p>\n")

myst_parser/sphinx_ext/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
"""The setup for the sphinx extension."""
22
from typing import Any
33

4+
from docutils import nodes
45
from sphinx.application import Sphinx
56

7+
from myst_parser.parsers.docutils_ import depart_rubric_html, visit_rubric_html
68
from myst_parser.warnings_ import MystWarnings
79

810

@@ -28,6 +30,11 @@ def setup_sphinx(app: Sphinx, load_parser=False):
2830

2931
app.add_post_transform(MystReferenceResolver)
3032

33+
# override only the html writer visit methods for rubric, to use the "level" attribute
34+
app.add_node(
35+
nodes.rubric, override=True, html=(visit_rubric_html, depart_rubric_html)
36+
)
37+
3138
for name, default, field in MdParserConfig().as_triple():
3239
if "sphinx" not in field.metadata.get("omit", []):
3340
# TODO add types?

myst_parser/warnings_.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@ class MystWarnings(Enum):
2626
"""Missing Markdown footnote definition."""
2727
MD_HEADING_NON_CONSECUTIVE = "header"
2828
"""Non-consecutive heading levels."""
29-
MD_HEADING_NESTED = "nested_header"
30-
"""Header found nested in another element."""
3129

3230
DIRECTIVE_PARSING = "directive_parse"
3331
"""Issue parsing directive."""

tests/test_renderers/fixtures/docutil_syntax_elements.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,16 @@ Heading Levels:
111111
d
112112
.
113113

114+
Nested heading
115+
.
116+
> # heading
117+
.
118+
<document source="notset">
119+
<block_quote>
120+
<rubric level="1">
121+
heading
122+
.
123+
114124
Block Code:
115125
.
116126
foo

tests/test_renderers/fixtures/reporter_warnings.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ header nested in admonition
160160
# Header
161161
```
162162
.
163-
<string>:2: (WARNING/2) Disallowed nested header found, converting to rubric [myst.nested_header]
163+
164164
.
165165

166166
nested parse warning

tests/test_renderers/fixtures/sphinx_syntax_elements.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,16 @@ Heading Levels:
111111
d
112112
.
113113

114+
Nested heading
115+
.
116+
> # heading
117+
.
118+
<document source="<src>/index.md">
119+
<block_quote>
120+
<rubric level="1">
121+
heading
122+
.
123+
114124
Block Code:
115125
.
116126
foo

0 commit comments

Comments
 (0)