Skip to content

Commit 03b60ae

Browse files
committed
Add flag for force creating page
1 parent d09ee4e commit 03b60ae

File tree

3 files changed

+17
-6
lines changed

3 files changed

+17
-6
lines changed

docs/conversion.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,11 @@ HTML->JSON-DOC tasks
125125
- [x] Convert line breaks `<br>`
126126
- [x] Convert `<caption>` and `<figcaption>`
127127
- [x] Force_page=true
128+
- [x] Add an argument to CLI for force-creating a page
129+
- Rationale: pandoc does not emit a top-level `<html>` and `<body>` when converting from docx to HTML
128130
- [ ] Residual strings, newlines or empty paragraphs in the final output list (in progress)
129131
- [ ] Make sure `<a>` conversion is consistent
130132
- [ ] Cleanup empty blocks at the end
131133
- [ ] Table cells with colspan/rowspan
132134
- [ ] Add test for `<code>` and `<pre>`
133-
- [ ] Table thead/tbody/tfoot ordering
135+
- [ ] Table thead/tbody/tfoot ordering

jsondoc/bin/convert_jsondoc.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def convert_to_jsondoc(
6565
indent: int | None = None,
6666
source_format: str | None = None,
6767
target_format: str | None = None,
68+
force_page: bool = False,
6869
):
6970
"""
7071
Convert to and from JSON-DOC format.
@@ -127,6 +128,7 @@ def convert_to_jsondoc(
127128
input_content if input_content is not None else input_file,
128129
"html",
129130
format=source_format,
131+
extra_args=["--wrap=none"],
130132
)
131133
except RuntimeError as e:
132134
# Handle different error message from Pandoc
@@ -143,7 +145,7 @@ def convert_to_jsondoc(
143145
else:
144146
raise e
145147

146-
jsondoc = html_to_jsondoc(html_content)
148+
jsondoc = html_to_jsondoc(html_content, force_page=force_page)
147149

148150
# Serialize the jsondoc
149151
serialized_jsondoc = jsondoc_dump_json(jsondoc, indent=indent)
@@ -189,6 +191,12 @@ def main():
189191
help="Number of spaces for indentation in the output JSON file",
190192
default=None,
191193
)
194+
parser.add_argument(
195+
"--force-page",
196+
action="store_true",
197+
help="Force the creation of a page even if the input doesn't "
198+
"contain a top-level HTML structure",
199+
)
192200
args = parser.parse_args()
193201

194202
try:
@@ -198,6 +206,7 @@ def main():
198206
indent=args.indent,
199207
source_format=args.source_format,
200208
target_format=args.target_format,
209+
force_page=args.force_page,
201210
)
202211
except (ValueError, RuntimeError) as e:
203212
print(e)

jsondoc/convert/html.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class ConvertOutput(BaseModel):
6464
and then we concatenate the result with prev_objects and next_objects:
6565
>>> final_objects = prev_objects + reconciled_objects + next_objects
6666
"""
67+
6768
main_object: BlockBase | RichTextBase
6869
prev_objects: List[BlockBase | RichTextBase] = []
6970
next_objects: List[BlockBase | RichTextBase] = []
@@ -359,6 +360,7 @@ class DefaultOptions:
359360
default_title = False
360361
keep_inline_images_in = []
361362
strip = None
363+
force_page = False
362364

363365
class Options(DefaultOptions):
364366
pass
@@ -379,16 +381,14 @@ def convert(self, html: str | bytes) -> Page | BlockBase | List[BlockBase]:
379381
soup = BeautifulSoup(html, "html.parser")
380382
return self.convert_soup(soup)
381383

382-
def convert_soup(
383-
self, soup: BeautifulSoup, force_page=False
384-
) -> Page | BlockBase | List[BlockBase]:
384+
def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase]:
385385

386386
children = self.process_tag(soup, convert_as_inline=False, children_only=True)
387387
children = run_final_block_transformations(children)
388388
is_page = self._is_soup_page(soup)
389389

390390
ret = None
391-
if is_page or force_page:
391+
if is_page or self.options["force_page"]:
392392
title = self._get_html_title(soup)
393393
# Ensure that children is a list
394394
if not isinstance(children, list):

0 commit comments

Comments
 (0)