Skip to content

Commit 9355507

Browse files
committed
feat: refactoring of the code
1 parent 3e07f62 commit 9355507

25 files changed

+65
-109
lines changed

scrapegraphai/nodes/base_node.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ def update_config(self, params: dict, overwrite: bool = False):
8686
8787
Args:
8888
params (dict): The dictionary to update node_config with.
89-
overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None.
89+
overwrite (bool): Flag indicating if the values of node_config
90+
should be overwritten if their value is not None.
9091
"""
9192
for key, val in params.items():
9293
if hasattr(self, key) and not overwrite:
@@ -133,7 +134,8 @@ def _validate_input_keys(self, input_keys):
133134

134135
def _parse_input_keys(self, state: dict, expression: str) -> List[str]:
135136
"""
136-
Parses the input keys expression to extract relevant keys from the state based on logical conditions.
137+
Parses the input keys expression to extract
138+
relevant keys from the state based on logical conditions.
137139
The expression can contain AND (&), OR (|), and parentheses to group conditions.
138140
139141
Args:

scrapegraphai/nodes/fetch_node.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def execute(self, state):
133133
state.update({self.output[0]: compressed_document})
134134
return state
135135
elif input_keys[0] == "json":
136-
f = open(source)
136+
f = open(source, encoding="utf-8")
137137
compressed_document = [
138138
Document(page_content=str(json.load(f)), metadata={"source": "json"})
139139
]
@@ -181,12 +181,11 @@ def execute(self, state):
181181
if not response.text.strip():
182182
raise ValueError("No HTML body content found in the response.")
183183

184-
parsed_content = response
185-
186184
if not self.cut:
187185
parsed_content = cleanup_html(response, source)
188186

189-
if (isinstance(self.llm_model, ChatOpenAI) and not self.script_creator) or (self.force and not self.script_creator):
187+
if (isinstance(self.llm_model, ChatOpenAI)
188+
and not self.script_creator) or (self.force and not self.script_creator):
190189
parsed_content = convert_to_md(source, input_data[0])
191190
compressed_document = [Document(page_content=parsed_content)]
192191
else:
@@ -205,7 +204,8 @@ def execute(self, state):
205204
data = browser_base_fetch(self.browser_base.get("api_key"),
206205
self.browser_base.get("project_id"), [source])
207206

208-
document = [Document(page_content=content, metadata={"source": source}) for content in data]
207+
document = [Document(page_content=content,
208+
metadata={"source": source}) for content in data]
209209
else:
210210
loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
211211
document = loader.load()
@@ -215,10 +215,8 @@ def execute(self, state):
215215
parsed_content = document[0].page_content
216216

217217
if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
218-
219218
parsed_content = convert_to_md(document[0].page_content, input_data[0])
220219

221-
222220
compressed_document = [
223221
Document(page_content=parsed_content, metadata={"source": "html file"})
224222
]

scrapegraphai/nodes/generate_answer_csv_node.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,12 @@
33
Module for generating the answer node
44
"""
55

6-
# Imports from standard library
76
from typing import List, Optional
8-
9-
# Imports from Langchain
107
from langchain.prompts import PromptTemplate
118
from langchain_core.output_parsers import JsonOutputParser
129
from langchain_core.runnables import RunnableParallel
1310
from tqdm import tqdm
14-
1511
from ..utils.logging import get_logger
16-
17-
# Imports from the library
1812
from .base_node import BaseNode
1913
from ..helpers.generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv
2014

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
"""
22
GenerateAnswerNode Module
33
"""
4-
import asyncio
54
from typing import List, Optional
65
from langchain.prompts import PromptTemplate
76
from langchain_core.output_parsers import JsonOutputParser
87
from langchain_core.runnables import RunnableParallel
98
from langchain_openai import ChatOpenAI
109
from langchain_community.chat_models import ChatOllama
1110
from tqdm import tqdm
12-
from langchain_openai import ChatOpenAI
1311
from ..utils.logging import get_logger
1412
from .base_node import BaseNode
1513
from ..helpers import template_chunks, template_no_chunks, template_merge, template_chunks_md, template_no_chunks_md, template_merge_md
@@ -130,7 +128,6 @@ def execute(self, state: dict) -> dict:
130128
partial_variables={"context": chunk,
131129
"chunk_id": i + 1,
132130
"format_instructions": format_instructions})
133-
# Add chain to dictionary with dynamic name
134131
chain_name = f"chunk{i+1}"
135132
chains_dict[chain_name] = prompt | self.llm_model | output_parser
136133

scrapegraphai/nodes/generate_answer_omni_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def execute(self, state: dict) -> dict:
113113

114114
chain = prompt | self.llm_model | output_parser
115115
answer = chain.invoke({"question": user_prompt})
116-
116+
117117
state.update({self.output[0]: answer})
118118
return state
119119

@@ -148,4 +148,4 @@ def execute(self, state: dict) -> dict:
148148
answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
149149

150150
state.update({self.output[0]: answer})
151-
return state
151+
return state

scrapegraphai/nodes/generate_answer_pdf_node.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,13 @@
22
Module for generating the answer node
33
"""
44

5-
# Imports from standard library
65
from typing import List, Optional
7-
8-
# Imports from Langchain
96
from langchain.prompts import PromptTemplate
107
from langchain_core.output_parsers import JsonOutputParser
118
from langchain_core.runnables import RunnableParallel
129
from tqdm import tqdm
1310
from langchain_community.chat_models import ChatOllama
1411
from ..utils.logging import get_logger
15-
16-
# Imports from the library
1712
from .base_node import BaseNode
1813
from ..helpers.generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf
1914

scrapegraphai/nodes/generate_scraper_node.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,6 @@ def execute(self, state: dict) -> dict:
8383
user_prompt = input_data[0]
8484
doc = input_data[1]
8585

86-
# schema to be used for output parsing
8786
if self.node_config.get("schema", None) is not None:
8887
output_schema = JsonOutputParser(pydantic_object=self.node_config["schema"])
8988
else:
@@ -130,7 +129,6 @@ def execute(self, state: dict) -> dict:
130129
)
131130
map_chain = prompt | self.llm_model | StrOutputParser()
132131

133-
# Chain
134132
answer = map_chain.invoke({"question": user_prompt})
135133

136134
state.update({self.output[0]: answer})

scrapegraphai/nodes/get_probable_tags_node.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
GetProbableTagsNode Module
33
"""
4-
54
from typing import List, Optional
65
from langchain.output_parsers import CommaSeparatedListOutputParser
76
from langchain.prompts import PromptTemplate

scrapegraphai/nodes/graph_iterator_node.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,11 @@
55
import asyncio
66
import copy
77
from typing import List, Optional
8-
98
from tqdm.asyncio import tqdm
10-
119
from ..utils.logging import get_logger
1210
from .base_node import BaseNode
1311

14-
_default_batchsize = 16
12+
DEFAULT_BATCHSIZE = 16
1513

1614

1715
class GraphIteratorNode(BaseNode):
@@ -51,13 +49,15 @@ def execute(self, state: dict) -> dict:
5149
the correct data from the state.
5250
5351
Returns:
54-
dict: The updated state with the output key containing the results of the graph instances.
52+
dict: The updated state with the output key
53+
containing the results of the graph instances.
5554
5655
Raises:
57-
KeyError: If the input keys are not found in the state, indicating that the
58-
necessary information for running the graph instances is missing.
56+
KeyError: If the input keys are not found in the state,
57+
indicating that the necessary information for running
58+
the graph instances is missing.
5959
"""
60-
batchsize = self.node_config.get("batchsize", _default_batchsize)
60+
batchsize = self.node_config.get("batchsize", DEFAULT_BATCHSIZE)
6161

6262
self.logger.info(
6363
f"--- Executing {self.node_name} Node with batchsize {batchsize} ---"

scrapegraphai/nodes/image_to_text_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
"""
44

55
from typing import List, Optional
6-
76
from ..utils.logging import get_logger
87
from .base_node import BaseNode
98

109

1110
class ImageToTextNode(BaseNode):
1211
"""
13-
Retrieve images from a list of URLs and return a description of the images using an image-to-text model.
12+
Retrieve images from a list of URLs and return a description of
13+
the images using an image-to-text model.
1414
1515
Attributes:
1616
llm_model: An instance of the language model client used for image-to-text conversion.

scrapegraphai/nodes/merge_answers_node.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,10 @@
22
MergeAnswersNode Module
33
"""
44

5-
# Imports from standard library
65
from typing import List, Optional
7-
from tqdm import tqdm
8-
9-
# Imports from Langchain
106
from langchain.prompts import PromptTemplate
117
from langchain_core.output_parsers import JsonOutputParser
12-
from tqdm import tqdm
13-
148
from ..utils.logging import get_logger
15-
16-
# Imports from the library
179
from .base_node import BaseNode
1810

1911

scrapegraphai/nodes/merge_generated_scripts.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,9 @@
55
# Imports from standard library
66
from typing import List, Optional
77
from tqdm import tqdm
8-
9-
# Imports from Langchain
108
from langchain.prompts import PromptTemplate
119
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
12-
from tqdm import tqdm
13-
1410
from ..utils.logging import get_logger
15-
16-
# Imports from the library
1711
from .base_node import BaseNode
1812

1913

scrapegraphai/nodes/parse_node.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,23 +75,23 @@ def execute(self, state: dict) -> dict:
7575

7676
chunks = chunk(text=docs_transformed.page_content,
7777
chunk_size= self.node_config.get("chunk_size", 4096)-250,
78-
token_counter=lambda x: len(x),
78+
token_counter= lambda x: len(x),
7979
memoize=False)
8080
else:
8181
docs_transformed = docs_transformed[0]
8282

83-
if type(docs_transformed) == Document:
83+
if isinstance(docs_transformed, Document):
8484
chunks = chunk(text=docs_transformed.page_content,
8585
chunk_size= self.node_config.get("chunk_size", 4096)-250,
86-
token_counter=lambda x: len(x),
86+
token_counter= lambda x: len(x),
8787
memoize=False)
8888
else:
89-
89+
9090
chunks = chunk(text=docs_transformed,
9191
chunk_size= self.node_config.get("chunk_size", 4096)-250,
92-
token_counter=lambda x: len(x),
92+
token_counter= lambda x: len(x),
9393
memoize=False)
94-
94+
9595
state.update({self.output[0]: chunks})
9696

9797
return state

scrapegraphai/nodes/robots_node.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,9 @@
44

55
from typing import List, Optional
66
from urllib.parse import urlparse
7-
87
from langchain_community.document_loaders import AsyncChromiumLoader
98
from langchain.prompts import PromptTemplate
109
from langchain.output_parsers import CommaSeparatedListOutputParser
11-
12-
from langchain.output_parsers import CommaSeparatedListOutputParser
13-
from langchain.prompts import PromptTemplate
14-
from langchain_community.document_loaders import AsyncChromiumLoader
15-
1610
from ..helpers import robots_dictionary
1711
from ..utils.logging import get_logger
1812
from .base_node import BaseNode
@@ -146,4 +140,4 @@ def execute(self, state: dict) -> dict:
146140
self.logger.warning("\033[32m(Scraping this website is allowed)\033[0m")
147141

148142
state.update({self.output[0]: is_scrapable})
149-
return state
143+
return state

scrapegraphai/nodes/search_internet_node.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
"""
22
SearchInternetNode Module
33
"""
4-
54
from typing import List, Optional
6-
75
from langchain.output_parsers import CommaSeparatedListOutputParser
86
from langchain.prompts import PromptTemplate
97
from langchain_community.chat_models import ChatOllama

scrapegraphai/nodes/search_link_node.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,13 @@
22
SearchLinkNode Module
33
"""
44

5-
# Imports from standard library
65
from typing import List, Optional
76
import re
87
from tqdm import tqdm
9-
10-
# Imports from Langchain
118
from langchain.prompts import PromptTemplate
129
from langchain_core.output_parsers import JsonOutputParser
1310
from langchain_core.runnables import RunnableParallel
14-
1511
from ..utils.logging import get_logger
16-
17-
# Imports from the library
1812
from .base_node import BaseNode
1913

2014

scrapegraphai/nodes/search_node_with_context.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ def execute(self, state: dict) -> dict:
6767
# Fetching data from the state based on the input keys
6868
input_data = [state[key] for key in input_keys]
6969

70-
user_prompt = input_data[0]
7170
doc = input_data[1]
7271

7372
output_parser = CommaSeparatedListOutputParser()

scrapegraphai/nodes/text_to_speech_node.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
11
"""
22
TextToSpeechNode Module
33
"""
4-
54
from typing import List, Optional
6-
75
from ..utils.logging import get_logger
86
from .base_node import BaseNode
97

10-
118
class TextToSpeechNode(BaseNode):
129
"""
1310
Converts text to speech using the specified text-to-speech model.

scrapegraphai/utils/convert_to_md.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""
22
convert_to_md module
33
"""
4-
import html2text
54
from urllib.parse import urlparse
5+
import html2text
66

77
def convert_to_md(html: str, url: str = None) -> str:
88
""" Convert HTML to Markdown.

0 commit comments

Comments
 (0)