Skip to content

Commit 07f1e23

Browse files
committed
fix: parse_node
1 parent 68f58cc commit 07f1e23

File tree

4 files changed

+28
-16
lines changed

4 files changed

+28
-16
lines changed

examples/local_models/smart_scraper_ollama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
smart_scraper_graph = SmartScraperGraph(
3131
prompt="List me all the titles",
32-
source="https://sport.sky.it/nba?gr=www",
32+
source="https://perinim.github.io/projects",
3333
config=graph_config
3434
)
3535

scrapegraphai/nodes/base_node.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ def update_config(self, params: dict, overwrite: bool = False):
8888
param (dict): The dictionary to update node_config with.
8989
overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None.
9090
"""
91-
9291
for key, val in params.items():
9392
if hasattr(self, key) and not overwrite:
9493
continue

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def execute(self, state: dict) -> dict:
121121
answer = chain.invoke({"question": user_prompt})
122122
break
123123

124-
prompt = PromptTemplate(
124+
prompt = PromptTemplate(
125125
template=template_chunks_prompt,
126126
input_variables=["question"],
127127
partial_variables={"context": chunk,

scrapegraphai/nodes/parse_node.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,35 +50,48 @@ def execute(self, state: dict) -> dict:
5050
5151
Args:
5252
state (dict): The current state of the graph. The input keys will be used to fetch the
53-
correct data from the state.
53+
correct data from the state.
5454
5555
Returns:
5656
dict: The updated state with the output key containing the parsed content chunks.
5757
5858
Raises:
59-
KeyError: If the input keys are not found in the state.
59+
KeyError: If the input keys are not found in the state, indicating that the
60+
necessary information for parsing the content is missing.
6061
"""
6162

6263
self.logger.info(f"--- Executing {self.node_name} Node ---")
6364

64-
# Fetch data using input keys
65+
# Interpret input keys based on the provided input expression
6566
input_keys = self.get_input_keys(state)
67+
68+
# Fetching data from the state based on the input keys
6669
input_data = [state[key] for key in input_keys]
70+
# Parse the document
6771
docs_transformed = input_data[0]
68-
69-
# Parse HTML if enabled
7072
if self.parse_html:
7173
docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
7274
docs_transformed = docs_transformed[0]
7375

74-
# Get text content
75-
text_content = docs_transformed.page_content if type(docs_transformed) == Document else docs_transformed
76-
77-
# Chunk the text
78-
chunk_size = self.node_config.get("chunk_size", 4096) - 250
79-
chunks = chunk(text=text_content, chunk_size=chunk_size, token_counter=lambda x: len(x.split()), memoize=False)
76+
chunks = chunk(text=docs_transformed.page_content,
77+
chunk_size= self.node_config.get("chunk_size", 4096),
78+
token_counter=lambda x: len(x),
79+
memoize=False)
80+
else:
81+
docs_transformed = docs_transformed[0]
8082

81-
# Update state with chunks
83+
if type(docs_transformed) == Document:
84+
chunks = chunk(text=docs_transformed.page_content,
85+
chunk_size= self.node_config.get("chunk_size", 4096),
86+
token_counter=lambda x: len(x),
87+
memoize=False)
88+
else:
89+
90+
chunks = chunk(text=docs_transformed,
91+
chunk_size= self.node_config.get("chunk_size", 4096),
92+
token_counter=lambda x: len(x),
93+
memoize=False)
94+
8295
state.update({self.output[0]: chunks})
8396

84-
return state
97+
return state

0 commit comments

Comments
 (0)