
Commit 81af62d

Merge pull request #622 from LorenzoPaleari/pre/beta
ScrapeGraphAI/580-OmniScraperGraph-fix
2 parents fc55418 + 57337a0 · commit 81af62d

23 files changed: +115 −28 lines

examples/anthropic/custom_graph_haiku.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/bedrock/custom_graph_bedrock.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/ernie/custom_graph_ernie.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/fireworks/custom_graph_fireworks.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/groq/custom_graph_groq.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/huggingfacehub/custom_graph_huggingfacehub.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/local_models/custom_graph_ollama.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/mistral/custom_graph_mistral.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/mixed_models/custom_graph_groq_openai.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/nemotron/custom_graph_nemotron.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/oneapi/custom_graph_oneapi.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/openai/custom_graph_openai.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

scrapegraphai/graphs/deep_scraper_graph.py

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ def _create_repeated_graph(self) -> BaseGraph:
         """
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc", "link_urls", "img_urls"]
+            output=["doc"]
         )
         parse_node = ParseNode(
             input="doc",

scrapegraphai/graphs/json_scraper_graph.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="json | json_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
         )
 
         generate_answer_node = GenerateAnswerNode(

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 4 additions & 3 deletions
@@ -65,16 +65,17 @@ def _create_graph(self) -> BaseGraph:
         """
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
             }
         )
         parse_node = ParseNode(
-            input="doc",
-            output=["parsed_doc"],
+            input="doc & (url | local_dir)",
+            output=["parsed_doc", "link_urls", "img_urls"],
             node_config={
                 "chunk_size": self.model_token,
+                "parse_urls": True,
                 "llm_model": self.llm_model
             }
         )
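
Note: this is the core of the fix. FetchNode no longer emits link_urls and img_urls; ParseNode now derives them from the parsed document when "parse_urls" is enabled, which is why it also takes the source URL as input to resolve relative links. A minimal sketch of the new wiring (the import path follows this repository's package layout; the chunk_size and llm_model values are illustrative placeholders, not values from this commit):

# Sketch of the OmniScraperGraph node wiring after this commit.
# Assumes the constructors shown in the diff above; chunk_size and
# llm_model stand in for self.model_token / self.llm_model.
from scrapegraphai.nodes import FetchNode, ParseNode

fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],  # link_urls / img_urls are no longer produced here
    node_config={"loader_kwargs": {}},
)
parse_node = ParseNode(
    input="doc & (url | local_dir)",  # source is needed to resolve relative URLs
    output=["parsed_doc", "link_urls", "img_urls"],
    node_config={
        "chunk_size": 4096,   # placeholder
        "parse_urls": True,   # opt in to URL extraction
        "llm_model": None,    # placeholder
    },
)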

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "loader_kwargs": self.config.get("loader_kwargs", {}),

scrapegraphai/graphs/search_link_graph.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="url| local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "force": self.config.get("force", False),

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ def _create_graph(self) -> BaseGraph:
         """
         fetch_node = FetchNode(
             input="url| local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "force": self.config.get("force", False),

scrapegraphai/graphs/speech_graph.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc", "link_urls", "img_urls"]
+            output=["doc"]
         )
         parse_node = ParseNode(
             input="doc",

scrapegraphai/graphs/xml_scraper_graph.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="xml | xml_dir",
-            output=["doc", "link_urls", "img_urls"]
+            output=["doc"]
        )
 
         generate_answer_node = GenerateAnswerNode(

scrapegraphai/nodes/image_to_text_node.py

Lines changed: 15 additions & 4 deletions
@@ -1,9 +1,11 @@
 """
 ImageToTextNode Module
 """
+import traceback
 from typing import List, Optional
 from ..utils.logging import get_logger
 from .base_node import BaseNode
+from langchain_core.messages import HumanMessage
 
 class ImageToTextNode(BaseNode):
     """

@@ -58,16 +60,25 @@ def execute(self, state: dict) -> dict:
         if isinstance(urls, str):
             urls = [urls]
         elif len(urls) == 0:
-            return state
+            return state.update({self.output[0]: []})
 
         # Skip the image-to-text conversion
         if self.max_images < 1:
-            return state
-
+            return state.update({self.output[0]: []})
+
         img_desc = []
         for url in urls[: self.max_images]:
             try:
-                text_answer = self.llm_model.run(url)
+                message = HumanMessage(
+                    content=[
+                        {"type": "text", "text": "Describe the provided image."},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": url},
+                        },
+                    ]
+                )
+                text_answer = self.llm_model.invoke([message]).content
             except Exception as e:
                 text_answer = f"Error: incompatible image format or model failure."
             img_desc.append(text_answer)
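
Note: the replacement swaps the old llm_model.run(url) call for LangChain's standard multimodal message format, so any chat model that accepts image_url content blocks can produce the description. One caveat in the code as committed: dict.update() returns None, so the early `return state.update(...)` branches return None rather than the state. A standalone sketch of the message pattern (ChatOpenAI and the model name are illustrative assumptions, not part of this commit):

# Sketch of the multimodal HumanMessage pattern used above.
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI  # assumed provider for illustration

llm_model = ChatOpenAI(model="gpt-4o-mini")  # hypothetical model choice
message = HumanMessage(
    content=[
        {"type": "text", "text": "Describe the provided image."},
        {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
    ]
)
text_answer = llm_model.invoke([message]).content  # plain-text description
print(text_answer)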

scrapegraphai/nodes/parse_node.py

Lines changed: 76 additions & 1 deletion
@@ -1,11 +1,15 @@
 """
 ParseNode Module
 """
-from typing import List, Optional
+from typing import Tuple, List, Optional
+from urllib.parse import urljoin
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
+from ..helpers import default_filters
+
+import re
 
 class ParseNode(BaseNode):
     """

@@ -41,6 +45,66 @@ def __init__(
             True if node_config is None else node_config.get("parse_html", True)
         )
         self.llm_model = node_config['llm_model']
+        self.parse_urls = (
+            False if node_config is None else node_config.get("parse_urls", False)
+        )
+
+    def _clean_urls(self, urls: List[str]) -> List[str]:
+        """
+        Cleans the URLs extracted from the text.
+
+        Args:
+            urls (List[str]): The list of URLs to clean.
+
+        Returns:
+            List[str]: The cleaned URLs.
+        """
+        cleaned_urls = []
+        for url in urls:
+            # Remove any leading 'thumbnail](' or similar patterns
+            url = re.sub(r'.*?\]\(', '', url)
+
+            # Remove any trailing parentheses or brackets
+            url = url.rstrip(').')
+
+            cleaned_urls.append(url)
+
+        return cleaned_urls
+
+    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
+        """
+        Extracts URLs from the given text.
+
+        Args:
+            text (str): The text to extract URLs from.
+
+        Returns:
+            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
+        """
+        # Return empty lists if the URLs are not to be parsed
+        if not self.parse_urls:
+            return [], []
+
+        # Regular expression to find URLs (both links and images)
+        image_extensions = default_filters.filter_dict["img_exts"]
+        image_extension_seq = '|'.join(image_extensions).replace('.','')
+        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
+
+        # Find all URLs in the string
+        all_urls = url_pattern.findall(text)
+        all_urls = self._clean_urls(all_urls)
+
+        if not source.startswith("http"):
+            # Remove any URLs that are not complete
+            all_urls = [url for url in all_urls if url.startswith("http")]
+        else:
+            # Add to local URLs the source URL
+            all_urls = [urljoin(source, url) for url in all_urls]
+
+        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
+        links = [url for url in all_urls if url not in images]
+
+        return links, images

@@ -63,7 +127,9 @@ def execute(self, state: dict) -> dict:
         input_keys = self.get_input_keys(state)
 
         input_data = [state[key] for key in input_keys]
+
         docs_transformed = input_data[0]
+        source = input_data[1] if self.parse_urls else None
 
         def count_tokens(text):
             from ..utils import token_count

@@ -73,12 +139,17 @@ def count_tokens(text):
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
+            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096)-250,
                            token_counter=count_tokens,
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]
+
+            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 

@@ -94,4 +165,8 @@ def count_tokens(text):
                            memoize=False)
 
         state.update({self.output[0]: chunks})
+        if self.parse_urls:
+            state.update({self.output[1]: link_urls})
+            state.update({self.output[2]: img_urls})
+
         return state
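
Note: the extraction is purely regex-based. Absolute http(s) URLs and bare paths ending in a known image extension are matched, stripped of markdown residue such as a leading "](", and, when the source is itself a URL, resolved against it with urljoin before being split into image and link lists. A self-contained sketch of that flow (the extension list is an assumed stand-in for default_filters.filter_dict["img_exts"], whose contents are not shown in this diff):

# Self-contained sketch of ParseNode's new URL-extraction flow.
import re
from urllib.parse import urljoin

image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".webp"]  # assumed list
ext_seq = "|".join(image_extensions).replace(".", "")
url_pattern = re.compile(r"(https?://[^\s]+|\S+\.(?:" + ext_seq + "))")

text = "See [thumbnail](images/logo.png) and https://example.com/docs."
source = "https://example.com/page"

# Clean markdown residue and trailing punctuation, as _clean_urls does.
urls = [re.sub(r".*?\]\(", "", u).rstrip(").") for u in url_pattern.findall(text)]
urls = [urljoin(source, u) for u in urls]  # resolve relative paths against the page

images = [u for u in urls if any(u.endswith(ext) for ext in image_extensions)]
links = [u for u in urls if u not in images]
print(links)   # ['https://example.com/docs']
print(images)  # ['https://example.com/images/logo.png']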

tests/graphs/abstract_graph_test.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ def __init__(self, prompt: str, config: dict):
     def _create_graph(self) -> BaseGraph:
         fetch_node = FetchNode(
             input="url| local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "force": self.config.get("force", False),
