Skip to content

ScrapeGraphAI/580-OmniScraperGraph-fix #622

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/anthropic/custom_graph_haiku.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/bedrock/custom_graph_bedrock.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/ernie/custom_graph_ernie.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/fireworks/custom_graph_fireworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/groq/custom_graph_groq.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/huggingfacehub/custom_graph_huggingfacehub.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/local_models/custom_graph_ollama.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/mistral/custom_graph_mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/mixed_models/custom_graph_groq_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/nemotron/custom_graph_nemotron.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/oneapi/custom_graph_oneapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion examples/openai/custom_graph_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"verbose": True,
"headless": True,
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/deep_scraper_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def _create_repeated_graph(self) -> BaseGraph:
"""
fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"]
output=["doc"]
)
parse_node = ParseNode(
input="doc",
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/json_scraper_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph:

fetch_node = FetchNode(
input="json | json_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
)

generate_answer_node = GenerateAnswerNode(
Expand Down
7 changes: 4 additions & 3 deletions scrapegraphai/graphs/omni_scraper_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,16 +65,17 @@ def _create_graph(self) -> BaseGraph:
"""
fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"loader_kwargs": self.config.get("loader_kwargs", {}),
}
)
parse_node = ParseNode(
input="doc",
output=["parsed_doc"],
input="doc & (url | local_dir)",
output=["parsed_doc", "link_urls", "img_urls"],
node_config={
"chunk_size": self.model_token,
"parse_urls": True,
"llm_model": self.llm_model
}
)
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/script_creator_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _create_graph(self) -> BaseGraph:

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"llm_model": self.llm_model,
"loader_kwargs": self.config.get("loader_kwargs", {}),
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/search_link_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _create_graph(self) -> BaseGraph:

fetch_node = FetchNode(
input="url| local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"llm_model": self.llm_model,
"force": self.config.get("force", False),
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/smart_scraper_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _create_graph(self) -> BaseGraph:
"""
fetch_node = FetchNode(
input="url| local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"llm_model": self.llm_model,
"force": self.config.get("force", False),
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/speech_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _create_graph(self) -> BaseGraph:

fetch_node = FetchNode(
input="url | local_dir",
output=["doc", "link_urls", "img_urls"]
output=["doc"]
)
parse_node = ParseNode(
input="doc",
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/xml_scraper_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def _create_graph(self) -> BaseGraph:

fetch_node = FetchNode(
input="xml | xml_dir",
output=["doc", "link_urls", "img_urls"]
output=["doc"]
)

generate_answer_node = GenerateAnswerNode(
Expand Down
19 changes: 15 additions & 4 deletions scrapegraphai/nodes/image_to_text_node.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""
ImageToTextNode Module
"""
import traceback
from typing import List, Optional
from ..utils.logging import get_logger
from .base_node import BaseNode
from langchain_core.messages import HumanMessage

class ImageToTextNode(BaseNode):
"""
Expand Down Expand Up @@ -58,16 +60,25 @@ def execute(self, state: dict) -> dict:
if isinstance(urls, str):
urls = [urls]
elif len(urls) == 0:
return state
return state.update({self.output[0]: []})

# Skip the image-to-text conversion
if self.max_images < 1:
return state

return state.update({self.output[0]: []})
img_desc = []
for url in urls[: self.max_images]:
try:
text_answer = self.llm_model.run(url)
message = HumanMessage(
content=[
{"type": "text", "text": "Describe the provided image."},
{
"type": "image_url",
"image_url": {"url": url},
},
]
)
text_answer = self.llm_model.invoke([message]).content
except Exception as e:
text_answer = f"Error: incompatible image format or model failure."
img_desc.append(text_answer)
Expand Down
77 changes: 76 additions & 1 deletion scrapegraphai/nodes/parse_node.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
"""
ParseNode Module
"""
from typing import List, Optional
from typing import Tuple, List, Optional
from urllib.parse import urljoin
from semchunk import chunk
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document
from .base_node import BaseNode
from ..helpers import default_filters

import re

class ParseNode(BaseNode):
"""
Expand Down Expand Up @@ -41,6 +45,66 @@ def __init__(
True if node_config is None else node_config.get("parse_html", True)
)
self.llm_model = node_config['llm_model']
self.parse_urls = (
False if node_config is None else node_config.get("parse_urls", False)
)

def _clean_urls(self, urls: List[str]) -> List[str]:
"""
Cleans the URLs extracted from the text.

Args:
urls (List[str]): The list of URLs to clean.

Returns:
List[str]: The cleaned URLs.
"""
cleaned_urls = []
for url in urls:
# Remove any leading 'thumbnail](' or similar patterns
url = re.sub(r'.*?\]\(', '', url)

# Remove any trailing parentheses or brackets
url = url.rstrip(').')

cleaned_urls.append(url)

return cleaned_urls

def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
    """
    Extract link URLs and image URLs from the given text.

    Args:
        text (str): The text to scan for URLs.
        source (str): The document source; when it is itself a web URL,
            relative URLs found in the text are resolved against it.

    Returns:
        Tuple[List[str], List[str]]: A tuple of (link URLs, image URLs).
    """
    # URL parsing disabled for this node: report nothing.
    if not self.parse_urls:
        return [], []

    # Build a pattern matching absolute http(s) URLs as well as bare
    # paths ending in a known image extension (e.g. 'foo.png').
    image_extensions = default_filters.filter_dict["img_exts"]
    ext_alternation = '|'.join(image_extensions).replace('.', '')
    url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + ext_alternation + '))')

    candidates = self._clean_urls(url_pattern.findall(text))

    if source.startswith("http"):
        # Web source: resolve relative URLs against the source page.
        candidates = [urljoin(source, u) for u in candidates]
    else:
        # Local document: there is no base to resolve against, so keep
        # only URLs that are already absolute.
        candidates = [u for u in candidates if u.startswith("http")]

    img_urls = [u for u in candidates if u.endswith(tuple(image_extensions))]
    link_urls = [u for u in candidates if u not in img_urls]

    return link_urls, img_urls

def execute(self, state: dict) -> dict:
"""
Expand All @@ -63,7 +127,9 @@ def execute(self, state: dict) -> dict:
input_keys = self.get_input_keys(state)

input_data = [state[key] for key in input_keys]

docs_transformed = input_data[0]
source = input_data[1] if self.parse_urls else None

def count_tokens(text):
from ..utils import token_count
Expand All @@ -73,12 +139,17 @@ def count_tokens(text):
docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
docs_transformed = docs_transformed[0]

link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)

chunks = chunk(text=docs_transformed.page_content,
chunk_size=self.node_config.get("chunk_size", 4096)-250,
token_counter=count_tokens,
memoize=False)
else:
docs_transformed = docs_transformed[0]

link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)

chunk_size = self.node_config.get("chunk_size", 4096)
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

Expand All @@ -94,4 +165,8 @@ def count_tokens(text):
memoize=False)

state.update({self.output[0]: chunks})
if self.parse_urls:
state.update({self.output[1]: link_urls})
state.update({self.output[2]: img_urls})

return state
2 changes: 1 addition & 1 deletion tests/graphs/abstract_graph_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(self, prompt: str, config: dict):
def _create_graph(self) -> BaseGraph:
fetch_node = FetchNode(
input="url| local_dir",
output=["doc", "link_urls", "img_urls"],
output=["doc"],
node_config={
"llm_model": self.llm_model,
"force": self.config.get("force", False),
Expand Down