Skip to content

Commit 99ad654

Browse files
authored
Merge branch 'pre/beta' into 713-pdf-scrapping
2 parents 26f89d8 + ac31d7f commit 99ad654

File tree

10 files changed

+379
-20
lines changed

10 files changed

+379
-20
lines changed

CHANGELOG.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,27 @@
1-
## [1.25.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.25.1) (2024-09-29)
1+
2+
## [1.26.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.25.0...v1.26.0-beta.1) (2024-09-29)
3+
4+
5+
6+
* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
7+
* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24))
8+
29

310

411
### Bug Fixes
512

613
* removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
714

15+
* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
16+
* removed deep scraper ([9aa8c88](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9aa8c889fb32f2eb2005a2fb04f05dc188092279))
17+
18+
19+
### CI
20+
21+
* **release:** 1.22.0-beta.4 [skip ci] ([4330179](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4330179cb65674d65423c1763f90182e85c15a74))
22+
* **release:** 1.22.0-beta.5 [skip ci] ([6d8f543](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/6d8f5435d1ecd2d90b06aade50abc064f75c9d78))
23+
* **release:** 1.22.0-beta.6 [skip ci] ([39f7815](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/39f78154a6f1123fa8aca5e169c803111c175473))
24+
825
## [1.25.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.24.1...v1.25.0) (2024-09-27)
926

1027

@@ -15,11 +32,15 @@
1532
## [1.24.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.24.0...v1.24.1) (2024-09-26)
1633

1734

35+
1836
### Bug Fixes
1937

2038
* script creator multi ([9905be8](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9905be8a37dc1ff4b90fe9b8be987887253be8bd))
2139

2240
## [1.24.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.23.1...v1.24.0) (2024-09-26)
41+
* integration with html_mode ([f87ffa1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f87ffa1d8db32b38c47d9f5aa2ae88f1d7978a04))
42+
43+
## [1.22.0-beta.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.4...v1.22.0-beta.5) (2024-09-27)
2344

2445

2546
### Features
@@ -44,6 +65,14 @@
4465
* **release:** 1.22.0-beta.1 [skip ci] ([f42a95f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f42a95faa05de39bd9cfc05e377d4b3da372e482))
4566
* **release:** 1.22.0-beta.2 [skip ci] ([431c09f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/431c09f551ac28581674c6061f055fde0350ed4c))
4667
* **release:** 1.22.0-beta.3 [skip ci] ([e5ac020](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e5ac0205d1e04a8b31e86166c3673915b70fd1e3))
68+
* add reasoning integration ([b2822f6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/b2822f620a610e61d295cbf4b670aa08fde9de24))
69+
70+
## [1.22.0-beta.4](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.3...v1.22.0-beta.4) (2024-09-27)
71+
72+
73+
### Features
74+
75+
* add html_mode to smart_scraper ([bdcffd6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/bdcffd6360237b27797546a198ceece55ce4bc81))
4776

4877
## [1.22.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.2...v1.22.0-beta.3) (2024-09-25)
4978

examples/extras/html_mode.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
"""
Basic example of a scraping pipeline using SmartScraperGraph.

By default, SmartScraperGraph converts the fetched page to Markdown
before extraction. If you want the pipeline to work on the original
HTML instead, set ``"html_mode": True`` in the graph configuration.
"""

import os
import json

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Load environment variables (e.g. OPENAI_API_KEY) from a .env file.
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "html_mode": True,  # skip the HTML -> Markdown conversion step
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

examples/extras/reasoning.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
"""
Basic example of scraping pipeline using SmartScraper
"""

import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Pull OPENAI_API_KEY (and friends) into the environment from .env.
load_dotenv()

# ------------------------------------------------
# Graph configuration
# ------------------------------------------------

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "reasoning": True,  # enable the reasoning/prompt-refinement step
    "verbose": True,
    "headless": False,
}

# ------------------------------------------------
# Build the SmartScraperGraph and execute it
# ------------------------------------------------

scraper = SmartScraperGraph(
    prompt="List me what does the company do, the name and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
)

print(json.dumps(scraper.run(), indent=4))

# ------------------------------------------------
# Report graph execution statistics
# ------------------------------------------------

print(prettify_exec_info(scraper.get_execution_info()))

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
[project]
22
name = "scrapegraphai"
33

4-
version = "1.25.1"
5-
4+
version = "1.26.0b1"
65

76
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
87
authors = [

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 81 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from ..nodes import (
1010
FetchNode,
1111
ParseNode,
12+
ReasoningNode,
1213
GenerateAnswerNode
1314
)
1415

@@ -69,7 +70,6 @@ def _create_graph(self) -> BaseGraph:
6970
"scrape_do": self.config.get("scrape_do")
7071
}
7172
)
72-
7373
parse_node = ParseNode(
7474
input="doc",
7575
output=["parsed_doc"],
@@ -89,19 +89,87 @@ def _create_graph(self) -> BaseGraph:
8989
}
9090
)
9191

92+
if self.config.get("html_mode") is False:
93+
parse_node = ParseNode(
94+
input="doc",
95+
output=["parsed_doc"],
96+
node_config={
97+
"llm_model": self.llm_model,
98+
"chunk_size": self.model_token
99+
}
100+
)
101+
102+
if self.config.get("reasoning"):
103+
reasoning_node = ReasoningNode(
104+
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
105+
output=["answer"],
106+
node_config={
107+
"llm_model": self.llm_model,
108+
"additional_info": self.config.get("additional_info"),
109+
"schema": self.schema,
110+
}
111+
)
112+
113+
if self.config.get("html_mode") is False and self.config.get("reasoning") is True:
114+
115+
return BaseGraph(
116+
nodes=[
117+
fetch_node,
118+
parse_node,
119+
reasoning_node,
120+
generate_answer_node,
121+
],
122+
edges=[
123+
(fetch_node, parse_node),
124+
(parse_node, reasoning_node),
125+
(reasoning_node, generate_answer_node)
126+
],
127+
entry_point=fetch_node,
128+
graph_name=self.__class__.__name__
129+
)
130+
131+
elif self.config.get("html_mode") is True and self.config.get("reasoning") is True:
132+
133+
return BaseGraph(
134+
nodes=[
135+
fetch_node,
136+
reasoning_node,
137+
generate_answer_node,
138+
],
139+
edges=[
140+
(fetch_node, reasoning_node),
141+
(reasoning_node, generate_answer_node)
142+
],
143+
entry_point=fetch_node,
144+
graph_name=self.__class__.__name__
145+
)
146+
147+
elif self.config.get("html_mode") is True and self.config.get("reasoning") is False:
148+
return BaseGraph(
149+
nodes=[
150+
fetch_node,
151+
generate_answer_node,
152+
],
153+
edges=[
154+
(fetch_node, generate_answer_node)
155+
],
156+
entry_point=fetch_node,
157+
graph_name=self.__class__.__name__
158+
)
159+
92160
return BaseGraph(
93-
nodes=[
94-
fetch_node,
95-
parse_node,
96-
generate_answer_node,
97-
],
98-
edges=[
99-
(fetch_node, parse_node),
100-
(parse_node, generate_answer_node)
101-
],
102-
entry_point=fetch_node,
103-
graph_name=self.__class__.__name__
104-
)
161+
nodes=[
162+
fetch_node,
163+
parse_node,
164+
generate_answer_node,
165+
],
166+
edges=[
167+
(fetch_node, parse_node),
168+
(parse_node, generate_answer_node)
169+
],
170+
entry_point=fetch_node,
171+
graph_name=self.__class__.__name__
172+
)
105173

106174
def run(self) -> str:
107175
"""

scrapegraphai/nodes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@
2727
from .html_analyzer_node import HtmlAnalyzerNode
2828
from .generate_code_node import GenerateCodeNode
2929
from .search_node_with_context import SearchLinksWithContext
30+
from .reasoning_node import ReasoningNode

scrapegraphai/nodes/html_analyzer_node.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,7 @@
44
from typing import List, Optional
55
from langchain.prompts import PromptTemplate
66
from langchain_core.output_parsers import StrOutputParser
7-
from langchain_core.runnables import RunnableParallel
8-
from langchain_core.utils.pydantic import is_basemodel_subclass
97
from langchain_community.chat_models import ChatOllama
10-
from tqdm import tqdm
118
from .base_node import BaseNode
129
from ..utils import reduce_html
1310
from ..prompts import (

scrapegraphai/nodes/reasoning_node.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
"""
ReasoningNode Module
"""
from typing import List, Optional
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from .base_node import BaseNode
from ..utils import transform_schema
from ..prompts import (
    TEMPLATE_REASONING, TEMPLATE_REASONING_WITH_CONTEXT
)

class ReasoningNode(BaseNode):
    """
    A node that refines the user prompt with the use of the schema and additional
    context, creating a precise prompt that explicitly links elements in the
    user's original input to their corresponding representations in the
    JSON schema.

    Attributes:
        llm_model: An instance of a language model client, configured for generating answers.
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "PromptRefiner".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "PromptRefiner",
    ):
        super().__init__(node_name, "node", input, output, 2, node_config)

        # node_config is effectively required: this lookup raises if it is
        # missing, so the defaults below never see a None config.
        self.llm_model = node_config["llm_model"]

        # Ollama models need the output format pinned to JSON explicitly.
        if isinstance(node_config["llm_model"], ChatOllama):
            self.llm_model.format = "json"

        # Default both flags to False, consistent with the other nodes in
        # this package (the original "True if node_config is None" branch
        # was unreachable and contradicted the `force` default).
        self.verbose = node_config.get("verbose", False)
        self.force = node_config.get("force", False)

        self.additional_info = node_config.get("additional_info", None)

        # Pydantic model describing the desired output. execute() assumes
        # it is not None (it calls .schema() on it) — confirm callers always
        # pass a schema.
        self.output_schema = node_config.get("schema")

    def execute(self, state: dict) -> dict:
        """
        Generate a refined prompt for the reasoning task based
        on the user's input and the JSON schema.

        Args:
            state (dict): The current state of the graph. The input keys will be used
                            to fetch the correct data from the state.

        Returns:
            dict: The updated state with the output key containing the generated answer.

        Raises:
            KeyError: If the input keys are not found in the state, indicating
                      that the necessary information for generating an answer is missing.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        user_prompt = state['user_prompt']

        # Flatten the pydantic schema into a simplified representation that
        # the prompt template can embed directly.
        simplified_schema = transform_schema(self.output_schema.schema())

        if self.additional_info is not None:
            prompt = PromptTemplate(
                template=TEMPLATE_REASONING_WITH_CONTEXT,
                partial_variables={"user_input": user_prompt,
                                   "json_schema": str(simplified_schema),
                                   "additional_context": self.additional_info})
        else:
            prompt = PromptTemplate(
                template=TEMPLATE_REASONING,
                partial_variables={"user_input": user_prompt,
                                   "json_schema": str(simplified_schema)})

        output_parser = StrOutputParser()

        # All template variables are pre-bound via partial_variables, so the
        # chain is invoked with an empty input mapping.
        chain = prompt | self.llm_model | output_parser
        refined_prompt = chain.invoke({})

        state.update({self.output[0]: refined_prompt})
        return state

0 commit comments

Comments
 (0)