👌 Handling of nested headers (#711)

chrisjsewell · web-flow · commit ac111ea6f585 · 2023-02-21T21:16:38.000+01:00
For the longest time, nested headers in myst-parser have been a pain, particularly in things like admonitions

In Markdown (and HTML) headings are allowed "anywhere", for example:

```markdown
> # Heading 1
## Heading 2

Paragraph
```

Is rendered as:

```html
<blockquote>
    <h1>Heading 1</h1>
</blockquote>
<h2>Heading 2</h2>
<p>Paragraph</p>
```

However, because docutils/sphinx treats headers as nested sections, this becomes problematic

```xml
<blockquote>
   <section>
      <title>
         Heading 1
<section>
   <title>
      Heading 2
   <paragraph>
      Paragraph
```

Sphinx cannot resolve the ToC tree (among other things) from such a structure.

This PR fixes this, by identifying if a heading is inside another component and instead outputting it as a "non-structural" rubric node

```xml
<blockquote>
   <rubric level=1>
      Heading 1
<section>
   <title>
      Heading 2
   <paragraph>
      Paragraph
```

Natively, docutils/sphinx does not deal with the "level" key in the rubric,
so here we also override the rubric HTML renderer to correctly output a `<h>` element, if "level" is present, to retrieve the desired:

```html
<blockquote>
    <h1>Heading 1</h1>
</blockquote>
<h2>Heading 2</h2>
<p>Paragraph</p>
```

There is no longer any warning of nested headers, since this is the intended behaviour

To clarify, the logic is now:

- A section can only be a child of the root document, or another section
- If a header token is encountered, with a parent that is not one of these, then it is added as a rubric
- Otherwise a new section is created, and the heading is added as a title, which is a child of the section
diff --git a/myst_parser/mdit_to_docutils/base.py b/myst_parser/mdit_to_docutils/base.py
@@ -118,7 +118,7 @@ def __getattr__(self, name: str):
             "current_node",
             "reporter",
             "language_module_rst",
-            "_level_to_elem",
+            "_level_to_section",
         ):
             raise AttributeError(
                 f"'{name}' attribute is not available until setup_render() is called"
@@ -143,7 +143,7 @@ def setup_render(
             self.document.settings.language_code
         )
         # a mapping of heading levels to its currently associated node
-        self._level_to_elem: dict[int, nodes.document | nodes.section] = {
+        self._level_to_section: dict[int, nodes.document | nodes.section] = {
             0: self.document
         }
         # mapping of section slug to section node
@@ -321,14 +321,18 @@ def _render_finalise(self) -> None:
                 )
 
     def nested_render_text(
-        self, text: str, lineno: int, inline: bool = False, allow_headings: bool = True
+        self,
+        text: str,
+        lineno: int,
+        inline: bool = False,
+        temp_root_node: None | nodes.Element = None,
     ) -> None:
         """Render unparsed text (appending to the current node).
 
         :param text: the text to render
         :param lineno: the starting line number of the text, within the full source
         :param inline: whether the text is inline or block
-        :param allow_headings: whether to allow headings in the text
+        :param temp_root_node: If set, allow sections to be created as children of this node
         """
         tokens = (
             self.md.parseInline(text, self.md_env)
@@ -345,12 +349,21 @@ def nested_render_text(
             if token.map:
                 token.map = [token.map[0] + lineno, token.map[1] + lineno]
 
-        current_match_titles = self.md_env.get("match_titles", None)
-        try:
-            self.md_env["match_titles"] = allow_headings
+        if temp_root_node is None:
             self._render_tokens(tokens)
-        finally:
-            self.md_env["match_titles"] = current_match_titles
+        else:
+            # we need to temporarily set the root node,
+            # and we also want to restore the level_to_section mapping at the end
+            current_level_to_section = {
+                i: node for i, node in self._level_to_section.items()
+            }
+            current_root_node = self.md_env.get("temp_root_node", None)
+            try:
+                self.md_env["temp_root_node"] = temp_root_node
+                self._render_tokens(tokens)
+            finally:
+                self.md_env["temp_root_node"] = current_root_node
+                self._level_to_section = current_level_to_section
 
     @contextmanager
     def current_node_context(
@@ -444,10 +457,10 @@ def update_section_level_state(self, section: nodes.section, level: int) -> None
         # find the closest parent section
         parent_level = max(
             section_level
-            for section_level in self._level_to_elem
+            for section_level in self._level_to_section
             if level > section_level
         )
-        parent = self._level_to_elem[parent_level]
+        parent = self._level_to_section[parent_level]
 
         # if we are jumping up to a non-consecutive level,
         # then warn about this, since this will not be propagated in the docutils AST
@@ -465,12 +478,12 @@ def update_section_level_state(self, section: nodes.section, level: int) -> None
         # append the new section to the parent
         parent.append(section)
         # update the state for this section level
-        self._level_to_elem[level] = section
+        self._level_to_section[level] = section
 
         # Remove all descendant sections from the section level state
-        self._level_to_elem = {
+        self._level_to_section = {
             section_level: section
-            for section_level, section in self._level_to_elem.items()
+            for section_level, section in self._level_to_section.items()
             if section_level <= level
         }
 
@@ -769,30 +782,28 @@ def blocks_mathjax_processing(self) -> bool:
     def render_heading(self, token: SyntaxTreeNode) -> None:
         """Render a heading, e.g. `# Heading`."""
 
-        if (
-            token.attrs.get("toc", None) == "false"
-            or self.md_env.get("match_titles", None) is False
-        ):
-            if token.attrs.get("toc", None) != "false":
-                # this can occur if a nested parse is performed by a directive
-                # (such as an admonition) which contains a header.
-                # this would break the document structure
-                self.create_warning(
-                    "Disallowed nested header found, converting to rubric",
-                    MystWarnings.MD_HEADING_NESTED,
-                    line=token_line(token, default=0),
-                    append_to=self.current_node,
-                )
+        level = int(token.tag[1])
 
-            rubric = nodes.rubric(token.content, "")
+        # sections are only allowed as a child of a document or another section
+        # the only exception to this, is if a directive has called a nested parse,
+        # and specifically specified that sections are allowed to be created as children
+        # of its root node (a.k.a match_titles=True)
+        parent_of_temp_root = (
+            self.md_env.get("temp_root_node", None) is not None
+            and self.current_node == self.md_env["temp_root_node"]
+        )
+        if not (
+            parent_of_temp_root
+            or isinstance(self.current_node, (nodes.document, nodes.section))
+        ):
+            # if this is not the case, we create a rubric node instead
+            rubric = nodes.rubric(token.content, "", level=level)
             self.add_line_and_source_path(rubric, token)
             self.copy_attributes(token, rubric, ("class", "id"))
             with self.current_node_context(rubric, append=True):
                 self.render_children(token)
             return
 
-        level = int(token.tag[1])
-
         # create the section node
         new_section = nodes.section()
         self.add_line_and_source_path(new_section, token)
@@ -1769,7 +1780,7 @@ def render_substitution(self, token: SyntaxTreeNode, inline: bool) -> None:
             if inline and not REGEX_DIRECTIVE_START.match(rendered):
                 self.nested_render_text(rendered, position, inline=True)
             else:
-                self.nested_render_text(rendered, position, allow_headings=False)
+                self.nested_render_text(rendered, position)
         finally:
             self.document.sub_references.difference_update(references)
 
diff --git a/myst_parser/mocking.py b/myst_parser/mocking.py
@@ -118,7 +118,7 @@ class Struct:
             reporter = self.document.reporter
             language = renderer.language_module_rst
             title_styles: list[str] = []
-            section_level = max(renderer._level_to_elem)
+            section_level = max(renderer._level_to_section)
             section_bubble_up_kludge = False
             inliner = self.inliner
 
@@ -174,7 +174,7 @@ def nested_parse(
             self._renderer.nested_render_text(
                 "\n".join(block),
                 self._lineno + input_offset,
-                allow_headings=match_titles,
+                temp_root_node=node if match_titles else None,
             )
         self.state_machine.match_titles = sm_match_titles
 
@@ -469,9 +469,7 @@ def run(self) -> list[nodes.Element]:
                     source_dir,
                     path.parent,
                 )
-            self.renderer.nested_render_text(
-                file_content, startline + 1, allow_headings=True
-            )
+            self.renderer.nested_render_text(file_content, startline + 1)
         finally:
             self.renderer.document["source"] = source
             self.renderer.reporter.source = rsource
diff --git a/myst_parser/parsers/docutils_.py b/myst_parser/parsers/docutils_.py
@@ -257,6 +257,10 @@ def parse(self, inputstring: str, document: nodes.document) -> None:
         :param inputstring: The source string to parse
         :param document: The root docutils node to add AST elements to
         """
+        from docutils.writers._html_base import HTMLTranslator
+
+        HTMLTranslator.visit_rubric = visit_rubric_html
+        HTMLTranslator.depart_rubric = depart_rubric_html
 
         self.setup_parse(inputstring, document)
 
@@ -350,3 +354,75 @@ def cli_xml(argv: Optional[List[str]] = None):
 def cli_pseudoxml(argv: Optional[List[str]] = None):
     """Cmdline entrypoint for converting MyST to pseudo-XML."""
     _run_cli("pseudoxml", "pseudo-XML", argv)
+
+
+def visit_rubric_html(self, node):
+    """Override the default HTML visit method for rubric nodes.
+
+    docutils structures a document, based on the headings, into nested sections::
+
+        # h1
+        ## h2
+        ### h3
+
+        <section>
+            <title>
+                h1
+            <section>
+                <title>
+                    h2
+                <section>
+                    <title>
+                        h3
+
+    This means that it is not possible to have "standard" headings nested inside
+    other components, such as blockquotes, because it would break the structure::
+
+        # h1
+        > ## h2
+        ### h3
+
+        <section>
+            <title>
+                h1
+            <blockquote>
+                <section>
+                    <title>
+                        h2
+            <section>
+                <title>
+                    h3
+
+    we work around this shortcoming, in `DocutilsRenderer.render_heading`,
+    by identifying if a heading is inside another component
+    and instead outputting it as a "non-structural" rubric node, and capture the level::
+
+        <section>
+            <title>
+                h1
+            <blockquote>
+                <rubric level=2>
+                    h2
+            <section>
+                <title>
+                    h3
+
+    However, docutils natively just outputs rubrics as <p> tags,
+    and does not "honor" the heading level.
+    So here we override the visit/depart methods to output the correct <h> element
+    """
+    if "level" in node:
+        self.body.append(self.starttag(node, f'h{node["level"]}', "", CLASS="rubric"))
+    else:
+        self.body.append(self.starttag(node, "p", "", CLASS="rubric"))
+
+
+def depart_rubric_html(self, node):
+    """Override the default HTML depart method for rubric nodes.
+
+    See explanation in `visit_rubric_html`
+    """
+    if "level" in node:
+        self.body.append(f'</h{node["level"]}>\n')
+    else:
+        self.body.append("</p>\n")
diff --git a/myst_parser/sphinx_ext/main.py b/myst_parser/sphinx_ext/main.py
@@ -1,8 +1,10 @@
 """The setup for the sphinx extension."""
 from typing import Any
 
+from docutils import nodes
 from sphinx.application import Sphinx
 
+from myst_parser.parsers.docutils_ import depart_rubric_html, visit_rubric_html
 from myst_parser.warnings_ import MystWarnings
 
 
@@ -28,6 +30,11 @@ def setup_sphinx(app: Sphinx, load_parser=False):
 
     app.add_post_transform(MystReferenceResolver)
 
+    # override only the html writer visit methods for rubric, to use the "level" attribute
+    app.add_node(
+        nodes.rubric, override=True, html=(visit_rubric_html, depart_rubric_html)
+    )
+
     for name, default, field in MdParserConfig().as_triple():
         if "sphinx" not in field.metadata.get("omit", []):
             # TODO add types?
diff --git a/myst_parser/warnings_.py b/myst_parser/warnings_.py
@@ -26,8 +26,6 @@ class MystWarnings(Enum):
     """Missing Markdown footnote definition."""
     MD_HEADING_NON_CONSECUTIVE = "header"
     """Non-consecutive heading levels."""
-    MD_HEADING_NESTED = "nested_header"
-    """Header found nested in another element."""
 
     DIRECTIVE_PARSING = "directive_parse"
     """Issue parsing directive."""
diff --git a/tests/test_renderers/fixtures/docutil_syntax_elements.md b/tests/test_renderers/fixtures/docutil_syntax_elements.md
@@ -111,6 +111,16 @@ Heading Levels:
             d
 .
 
+Nested heading
+.
+> # heading
+.
+<document source="notset">
+    <block_quote>
+        <rubric level="1">
+            heading
+.
+
 Block Code:
 .
     foo
diff --git a/tests/test_renderers/fixtures/reporter_warnings.md b/tests/test_renderers/fixtures/reporter_warnings.md
@@ -160,7 +160,7 @@ header nested in admonition
 # Header
 ```
 .
-<string>:2: (WARNING/2) Disallowed nested header found, converting to rubric [myst.nested_header]
+
 .
 
 nested parse warning
diff --git a/tests/test_renderers/fixtures/sphinx_syntax_elements.md b/tests/test_renderers/fixtures/sphinx_syntax_elements.md
@@ -111,6 +111,16 @@ Heading Levels:
             d
 .
 
+Nested heading
+.
+> # heading
+.
+<document source="<src>/index.md">
+    <block_quote>
+        <rubric level="1">
+            heading
+.
+
 Block Code:
 .
     foo

Original file line number	Diff line number	Diff line change
`@@ -111,6 +111,16 @@ Heading Levels:`
`111`	`111`	`d`
`112`	`112`	`.`
`113`	`113`
	`114`	`+Nested heading`
	`115`	`+.`
	`116`	`+> # heading`
	`117`	`+.`
	`118`	`+<document source="notset">`
	`119`	`+ <block_quote>`
	`120`	`+ <rubric level="1">`
	`121`	`+ heading`
	`122`	`+.`
	`123`	`+`
`114`	`124`	`Block Code:`
`115`	`125`	`.`
`116`	`126`	`foo`
Original file line number	Diff line number	Diff line change
`@@ -160,7 +160,7 @@ header nested in admonition`
`160`	`160`	`# Header`
`161`	`161`	```
`162`	`162`	`.`
`163`		`-<string>:2: (WARNING/2) Disallowed nested header found, converting to rubric [myst.nested_header]`
	`163`	`+`
`164`	`164`	`.`
`165`	`165`
`166`	`166`	`nested parse warning`