
Commit e6c7940

feat: add Parse_Node
1 parent 79b8326

3 files changed: +29 -8 lines changed


scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 13 additions & 1 deletion
@@ -11,6 +11,7 @@
 
 from ..nodes import (
     FetchNode,
+    ParseNode,
     RAGNode,
     GenerateAnswerPDFNode
 )
@@ -66,6 +67,15 @@ def _create_graph(self) -> BaseGraph:
             output=["doc"],
         )
 
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "parse_html": False,
+                "chunk_size": self.model_token
+            }
+        )
+
         rag_node = RAGNode(
             input="user_prompt & (parsed_doc | doc)",
             output=["relevant_chunks"],
@@ -86,11 +96,13 @@ def _create_graph(self) -> BaseGraph:
         return BaseGraph(
             nodes=[
                 fetch_node,
+                parse_node,
                 rag_node,
                 generate_answer_node_pdf,
            ],
            edges=[
-                (fetch_node, rag_node),
+                (fetch_node, parse_node),
+                (parse_node, rag_node),
                (rag_node, generate_answer_node_pdf)
            ],
            entry_point=fetch_node

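The edge changes above turn the PDF pipeline into a strictly linear chain: fetch_node → parse_node → rag_node → generate_answer_node_pdf. As a minimal standalone sketch, the snippet below shows how such a list of (source, target) edges implies an execution order when walked from the entry point; the string names stand in for the real node objects, and this is not BaseGraph's actual traversal code.

# Standalone sketch: walking a linear edge list from an entry point.
# Node names are plain strings standing in for the real node objects.

def execution_order(edges, entry_point):
    """Return the node sequence implied by a list of (source, target) edges."""
    successors = dict(edges)          # linear pipeline: one outgoing edge per node
    order = [entry_point]
    while order[-1] in successors:
        order.append(successors[order[-1]])
    return order

edges = [
    ("fetch_node", "parse_node"),     # new hop introduced by this commit
    ("parse_node", "rag_node"),
    ("rag_node", "generate_answer_node_pdf"),
]

print(execution_order(edges, "fetch_node"))
# ['fetch_node', 'parse_node', 'rag_node', 'generate_answer_node_pdf']
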
scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 2 additions & 1 deletion
@@ -3,8 +3,8 @@
 """
 
 from typing import Optional
+import logging
 from pydantic import BaseModel
-
 from .base_graph import BaseGraph
 from .abstract_graph import AbstractGraph
 
@@ -70,6 +70,7 @@ def _create_graph(self) -> BaseGraph:
             }
         )
         logging.info("FetchNode configured with headless: %s", self.config.get("headless", True))
+
         parse_node = ParseNode(
             input="doc",
             output=["parsed_doc"],

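The only functional change here is the new import logging for the pre-existing module-level logging.info call (plus a blank line). For that INFO message to show up at runtime, the calling application still has to configure logging; a minimal standard-library sketch, not part of this commit:

# Minimal sketch: enabling INFO-level output so calls like
# logging.info("FetchNode configured with headless: %s", ...) become visible.
import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s %(name)s: %(message)s",
)

logging.info("FetchNode configured with headless: %s", True)
# prints something like: INFO root: FetchNode configured with headless: True
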
scrapegraphai/nodes/parse_node.py

Lines changed: 14 additions & 6 deletions
@@ -70,12 +70,20 @@ def execute(self, state: dict) -> dict:
         docs_transformed = input_data[0]
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
-        docs_transformed = docs_transformed[0]
-
-        chunks = chunk(text=docs_transformed.page_content,
-                       chunk_size= self.node_config.get("chunk_size", 4096),
-                       token_counter=lambda x: len(x.split()),
-                       memoize=False)
+            docs_transformed = docs_transformed[0]
+
+            chunks = chunk(text=docs_transformed.page_content,
+                           chunk_size= self.node_config.get("chunk_size", 4096),
+                           token_counter=lambda x: len(x.split()),
+                           memoize=False)
+        else:
+            docs_transformed = docs_transformed[0]
+
+            chunks = chunk(text=docs_transformed,
+                           chunk_size= self.node_config.get("chunk_size", 4096),
+                           token_counter=lambda x: len(x.split()),
+                           memoize=False)
+
         state.update({self.output[0]: chunks})
 
         return state

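Both branches above pass token_counter=lambda x: len(x.split()) to chunk(), so chunk_size is measured in whitespace-separated words rather than model tokens; the new else branch simply chunks the raw string instead of a document's page_content when parse_html is False. Below is a naive illustrative re-implementation of that word-count chunking rule, not the chunk() helper ParseNode actually imports:

# Illustrative sketch of chunking by whitespace word count, mirroring
# token_counter=lambda x: len(x.split()) in the diff above.

def naive_word_chunks(text: str, chunk_size: int = 4096) -> list[str]:
    """Split text into pieces of at most chunk_size whitespace-separated words."""
    words = text.split()
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]

sample = "one two three four five six seven"
print(naive_word_chunks(sample, chunk_size=3))
# ['one two three', 'four five six', 'seven']
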