
Commit 81af62d

Merge pull request #622 from LorenzoPaleari/pre/beta
ScrapeGraphAI/580-OmniScraperGraph-fix
2 parents fc55418 + 57337a0 · commit 81af62d

23 files changed: +115 −28 lines

examples/anthropic/custom_graph_haiku.py

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/bedrock/custom_graph_bedrock.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/ernie/custom_graph_ernie.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/fireworks/custom_graph_fireworks.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/groq/custom_graph_groq.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/huggingfacehub/custom_graph_huggingfacehub.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/local_models/custom_graph_ollama.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/mistral/custom_graph_mistral.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/mixed_models/custom_graph_groq_openai.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/nemotron/custom_graph_nemotron.py

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/oneapi/custom_graph_oneapi.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

examples/openai/custom_graph_openai.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 
 fetch_node = FetchNode(
     input="url | local_dir",
-    output=["doc", "link_urls", "img_urls"],
+    output=["doc"],
     node_config={
         "verbose": True,
         "headless": True,

scrapegraphai/graphs/deep_scraper_graph.py

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ def _create_repeated_graph(self) -> BaseGraph:
         """
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc", "link_urls", "img_urls"]
+            output=["doc"]
         )
         parse_node = ParseNode(
             input="doc",

scrapegraphai/graphs/json_scraper_graph.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="json | json_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
         )
 
         generate_answer_node = GenerateAnswerNode(

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 4 additions & 3 deletions
@@ -65,16 +65,17 @@ def _create_graph(self) -> BaseGraph:
         """
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
             }
         )
         parse_node = ParseNode(
-            input="doc",
-            output=["parsed_doc"],
+            input="doc & (url | local_dir)",
+            output=["parsed_doc", "link_urls", "img_urls"],
             node_config={
                 "chunk_size": self.model_token,
+                "parse_urls": True,
                 "llm_model": self.llm_model
             }
         )
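
Note: this is the core of the fix. FetchNode no longer emits link_urls and img_urls; ParseNode now derives them from the parsed document when "parse_urls" is enabled, which is why it also takes the source URL as input to resolve relative links. A minimal sketch of the new wiring (the import path follows this repository's package layout; the chunk_size and llm_model values are illustrative placeholders, not values from this commit):

# Sketch of the OmniScraperGraph node wiring after this commit.
# Assumes the constructors shown in the diff above; chunk_size and
# llm_model stand in for self.model_token / self.llm_model.
from scrapegraphai.nodes import FetchNode, ParseNode

fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],  # link_urls / img_urls are no longer produced here
    node_config={"loader_kwargs": {}},
)
parse_node = ParseNode(
    input="doc & (url | local_dir)",  # source is needed to resolve relative URLs
    output=["parsed_doc", "link_urls", "img_urls"],
    node_config={
        "chunk_size": 4096,   # placeholder
        "parse_urls": True,   # opt in to URL extraction
        "llm_model": None,    # placeholder
    },
)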

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "loader_kwargs": self.config.get("loader_kwargs", {}),

scrapegraphai/graphs/search_link_graph.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="url| local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "force": self.config.get("force", False),

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ def _create_graph(self) -> BaseGraph:
         """
         fetch_node = FetchNode(
             input="url| local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "force": self.config.get("force", False),

scrapegraphai/graphs/speech_graph.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc", "link_urls", "img_urls"]
+            output=["doc"]
         )
         parse_node = ParseNode(
             input="doc",

scrapegraphai/graphs/xml_scraper_graph.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="xml | xml_dir",
-            output=["doc", "link_urls", "img_urls"]
+            output=["doc"]
        )
 
         generate_answer_node = GenerateAnswerNode(

scrapegraphai/nodes/image_to_text_node.py

Lines changed: 15 additions & 4 deletions
@@ -1,9 +1,11 @@
 """
 ImageToTextNode Module
 """
+import traceback
 from typing import List, Optional
 from ..utils.logging import get_logger
 from .base_node import BaseNode
+from langchain_core.messages import HumanMessage
 
 class ImageToTextNode(BaseNode):
     """

@@ -58,16 +60,25 @@ def execute(self, state: dict) -> dict:
         if isinstance(urls, str):
             urls = [urls]
         elif len(urls) == 0:
-            return state
+            return state.update({self.output[0]: []})
 
         # Skip the image-to-text conversion
         if self.max_images < 1:
-            return state
-
+            return state.update({self.output[0]: []})
+
         img_desc = []
         for url in urls[: self.max_images]:
             try:
-                text_answer = self.llm_model.run(url)
+                message = HumanMessage(
+                    content=[
+                        {"type": "text", "text": "Describe the provided image."},
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": url},
+                        },
+                    ]
+                )
+                text_answer = self.llm_model.invoke([message]).content
             except Exception as e:
                 text_answer = f"Error: incompatible image format or model failure."
             img_desc.append(text_answer)
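
Note: the replacement swaps the old llm_model.run(url) call for LangChain's standard multimodal message format, so any chat model that accepts image_url content blocks can produce the description. One caveat in the code as committed: dict.update() returns None, so the early `return state.update(...)` branches return None rather than the state. A standalone sketch of the message pattern (ChatOpenAI and the model name are illustrative assumptions, not part of this commit):

# Sketch of the multimodal HumanMessage pattern used above.
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI  # assumed provider for illustration

llm_model = ChatOpenAI(model="gpt-4o-mini")  # hypothetical model choice
message = HumanMessage(
    content=[
        {"type": "text", "text": "Describe the provided image."},
        {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
    ]
)
text_answer = llm_model.invoke([message]).content  # plain-text description
print(text_answer)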

scrapegraphai/nodes/parse_node.py

Lines changed: 76 additions & 1 deletion
@@ -1,11 +1,15 @@
 """
 ParseNode Module
 """
-from typing import List, Optional
+from typing import Tuple, List, Optional
+from urllib.parse import urljoin
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
 from .base_node import BaseNode
+from ..helpers import default_filters
+
+import re
 
 class ParseNode(BaseNode):
     """

@@ -41,6 +45,66 @@ def __init__(
             True if node_config is None else node_config.get("parse_html", True)
         )
         self.llm_model = node_config['llm_model']
+        self.parse_urls = (
+            False if node_config is None else node_config.get("parse_urls", False)
+        )
+
+    def _clean_urls(self, urls: List[str]) -> List[str]:
+        """
+        Cleans the URLs extracted from the text.
+
+        Args:
+            urls (List[str]): The list of URLs to clean.
+
+        Returns:
+            List[str]: The cleaned URLs.
+        """
+        cleaned_urls = []
+        for url in urls:
+            # Remove any leading 'thumbnail](' or similar patterns
+            url = re.sub(r'.*?\]\(', '', url)
+
+            # Remove any trailing parentheses or brackets
+            url = url.rstrip(').')
+
+            cleaned_urls.append(url)
+
+        return cleaned_urls
+
+    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
+        """
+        Extracts URLs from the given text.
+
+        Args:
+            text (str): The text to extract URLs from.
+
+        Returns:
+            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
+        """
+        # Return empty lists if the URLs are not to be parsed
+        if not self.parse_urls:
+            return [], []
+
+        # Regular expression to find URLs (both links and images)
+        image_extensions = default_filters.filter_dict["img_exts"]
+        image_extension_seq = '|'.join(image_extensions).replace('.','')
+        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
+
+        # Find all URLs in the string
+        all_urls = url_pattern.findall(text)
+        all_urls = self._clean_urls(all_urls)
+
+        if not source.startswith("http"):
+            # Remove any URLs that are not complete
+            all_urls = [url for url in all_urls if url.startswith("http")]
+        else:
+            # Add to local URLs the source URL
+            all_urls = [urljoin(source, url) for url in all_urls]
+
+        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
+        links = [url for url in all_urls if url not in images]
+
+        return links, images

@@ -63,7 +127,9 @@ def execute(self, state: dict) -> dict:
         input_keys = self.get_input_keys(state)
 
         input_data = [state[key] for key in input_keys]
+
         docs_transformed = input_data[0]
+        source = input_data[1] if self.parse_urls else None
 
         def count_tokens(text):
             from ..utils import token_count

@@ -73,12 +139,17 @@ def count_tokens(text):
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
+            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096)-250,
                            token_counter=count_tokens,
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]
+
+            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 

@@ -94,4 +165,8 @@ def count_tokens(text):
                            memoize=False)
 
         state.update({self.output[0]: chunks})
+        if self.parse_urls:
+            state.update({self.output[1]: link_urls})
+            state.update({self.output[2]: img_urls})
+
         return state
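
Note: the extraction is purely regex-based. Absolute http(s) URLs and bare paths ending in a known image extension are matched, stripped of markdown residue such as a leading "](", and, when the source is itself a URL, resolved against it with urljoin before being split into image and link lists. A self-contained sketch of that flow (the extension list is an assumed stand-in for default_filters.filter_dict["img_exts"], whose contents are not shown in this diff):

# Self-contained sketch of ParseNode's new URL-extraction flow.
import re
from urllib.parse import urljoin

image_extensions = [".png", ".jpg", ".jpeg", ".gif", ".webp"]  # assumed list
ext_seq = "|".join(image_extensions).replace(".", "")
url_pattern = re.compile(r"(https?://[^\s]+|\S+\.(?:" + ext_seq + "))")

text = "See [thumbnail](images/logo.png) and https://example.com/docs."
source = "https://example.com/page"

# Clean markdown residue and trailing punctuation, as _clean_urls does.
urls = [re.sub(r".*?\]\(", "", u).rstrip(").") for u in url_pattern.findall(text)]
urls = [urljoin(source, u) for u in urls]  # resolve relative paths against the page

images = [u for u in urls if any(u.endswith(ext) for ext in image_extensions)]
links = [u for u in urls if u not in images]
print(links)   # ['https://example.com/docs']
print(images)  # ['https://example.com/images/logo.png']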

tests/graphs/abstract_graph_test.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ def __init__(self, prompt: str, config: dict):
     def _create_graph(self) -> BaseGraph:
         fetch_node = FetchNode(
             input="url| local_dir",
-            output=["doc", "link_urls", "img_urls"],
+            output=["doc"],
             node_config={
                 "llm_model": self.llm_model,
                 "force": self.config.get("force", False),
