Skip to content

Commit 4088474

Browse files
committed
Added parse_html option in parse_node
1 parent 67d8fec commit 4088474

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

scrapegraphai/nodes/parse_node.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ def __init__(self, input: str, output: List[str], node_config: dict, node_name:
3030
super().__init__(node_name, "node", input, output, 1, node_config)
3131

3232
self.verbose = True if node_config is None else node_config.get("verbose", False)
33+
self.parse_html = True if node_config is None else node_config.get("parse_html", True)
3334

3435
def execute(self, state: dict) -> dict:
3536
"""
@@ -62,8 +63,11 @@ def execute(self, state: dict) -> dict:
6263
)
6364

6465
# Parse the document
65-
docs_transformed = Html2TextTransformer(
66-
).transform_documents(input_data[0])[0]
66+
docs_transformed = input_data[0]
67+
if self.parse_html:
68+
docs_transformed = Html2TextTransformer(
69+
).transform_documents(input_data[0])
70+
docs_transformed = docs_transformed[0]
6771

6872
chunks = text_splitter.split_text(docs_transformed.page_content)
6973

0 commit comments

Comments
 (0)