Skip to content

Commit 8cb9646

Browse files
committed
Merge branch 'main' into pre/beta
2 parents 9266a36 + 58b1133 commit 8cb9646

24 files changed

+75
-19
lines changed

.github/FUNDING.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# These are supported funding model platforms
2+
3+
github: ScrapeGraphAI
4+
patreon: # Replace with a single Patreon username
5+
open_collective:
6+
ko_fi: # Replace with a single Ko-fi username
7+
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8+
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9+
liberapay: # Replace with a single Liberapay username
10+
issuehunt: # Replace with a single IssueHunt username
11+
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12+
polar: # Replace with a single Polar username
13+
buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
14+
thanks_dev: # Replace with a single thanks.dev username
15+
custom:

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,18 @@
55

66
* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3))
77

8-
## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14)
98

9+
## [1.26.6](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6) (2024-10-18)
10+
11+
## [1.26.6-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.5...v1.26.6-beta.1) (2024-10-14)
1012

1113
### Bug Fixes
1214

1315
* remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918))
1416

17+
* refactoring of gpt2 tokenizer ([44c3f9c](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/44c3f9c98939c44caa86dc582242819a7c6a0f80))
18+
>>>>>>> main
19+
1520
## [1.26.5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.4...v1.26.5) (2024-10-13)
1621

1722

examples/extras/.env.example

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1-
OPENAI_API_KEY="OPENAI_API_KEY"
2-
BROWSER_BASE_PROJECT_ID="BROWSER_BASE_PROJECT_ID"
3-
BROWSER_BASE_API_KEY="BROWSERBASE_API_KEY"
1+
OPENAI_API_KEY="YOUR_OPENAI_API_KEY"
2+
BROWSER_BASE_PROJECT_ID="YOUR_BROWSER_BASE_PROJECT_ID"
3+
BROWSER_BASE_API_KEY="YOUR_BROWSERBASE_API_KEY"
4+
SCRAPE_DO_API_KEY="YOUR_SCRAPE_DO_API_KEY"

examples/openai/smart_scraper_openai.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
# ************************************************
2929

3030
smart_scraper_graph = SmartScraperGraph(
31-
prompt="List me what does the company do, the name and a contact email.",
32-
source="https://scrapegraphai.com/",
31+
prompt="Extract me all the articles",
32+
source="https://www.wired.com",
3333
config=graph_config
3434
)
3535

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ name = "scrapegraphai"
33

44
version = "1.27.0b1"
55

6+
67
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
78
authors = [
89
{ name = "Marco Vinciguerra", email = "[email protected]" },

scrapegraphai/graphs/abstract_graph.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ def _create_llm(self, llm_config: dict) -> object:
128128
if requests_per_second is not None:
129129
with warnings.catch_warnings():
130130
warnings.simplefilter("ignore")
131-
llm_params["rate_limiter"] = InMemoryRateLimiter(requests_per_second=requests_per_second)
131+
llm_params["rate_limiter"] = InMemoryRateLimiter(
132+
requests_per_second=requests_per_second)
132133
if max_retries is not None:
133134
llm_params["max_retries"] = max_retries
134135

scrapegraphai/graphs/base_graph.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def __init__(self, nodes: list, edges: list, entry_point: str,
5959
# raise a warning if the entry point is not the first node in the list
6060
warnings.warn(
6161
"Careful! The entry point node is different from the first node in the graph.")
62-
62+
6363
self._set_conditional_node_edges()
6464

6565
# Burr configuration
@@ -89,11 +89,9 @@ def _set_conditional_node_edges(self):
8989
"""
9090
for node in self.nodes:
9191
if node.node_type == 'conditional_node':
92-
# Find outgoing edges from this ConditionalNode
9392
outgoing_edges = [(from_node, to_node) for from_node, to_node in self.raw_edges if from_node.node_name == node.node_name]
9493
if len(outgoing_edges) != 2:
9594
raise ValueError(f"ConditionalNode '{node.node_name}' must have exactly two outgoing edges.")
96-
# Assign true_node_name and false_node_name
9795
node.true_node_name = outgoing_edges[0][1].node_name
9896
try:
9997
node.false_node_name = outgoing_edges[1][1].node_name

scrapegraphai/graphs/code_generator_graph.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ def _create_graph(self) -> BaseGraph:
9999
"schema": self.schema,
100100
}
101101
)
102+
102103
prompt_refier_node = PromptRefinerNode(
103104
input="user_prompt",
104105
output=["refined_prompt"],
@@ -108,6 +109,7 @@ def _create_graph(self) -> BaseGraph:
108109
"schema": self.schema
109110
}
110111
)
112+
111113
html_analyzer_node = HtmlAnalyzerNode(
112114
input="refined_prompt & original_html",
113115
output=["html_info", "reduced_html"],
@@ -118,6 +120,7 @@ def _create_graph(self) -> BaseGraph:
118120
"reduction": self.config.get("reduction", 0)
119121
}
120122
)
123+
121124
generate_code_node = GenerateCodeNode(
122125
input="user_prompt & refined_prompt & html_info & reduced_html & answer",
123126
output=["generated_code"],

scrapegraphai/graphs/csv_scraper_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def _create_graph(self):
5959
"""
6060
Creates the graph of nodes representing the workflow for web scraping.
6161
"""
62+
6263
fetch_node = FetchNode(
6364
input="csv | csv_dir",
6465
output=["doc"],
@@ -90,6 +91,7 @@ def run(self) -> str:
9091
"""
9192
Executes the web scraping process and returns the answer to the prompt.
9293
"""
94+
9395
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
9496
self.final_state, self.execution_info = self.graph.execute(inputs)
9597

scrapegraphai/graphs/csv_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def run(self) -> str:
9494
Returns:
9595
str: The answer to the prompt.
9696
"""
97+
9798
inputs = {"user_prompt": self.prompt, "jsons": self.source}
9899
self.final_state, self.execution_info = self.graph.execute(inputs)
99100

scrapegraphai/graphs/document_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def run(self) -> str:
9494
Returns:
9595
str: The answer to the prompt.
9696
"""
97+
9798
inputs = {"user_prompt": self.prompt, "xmls": self.source}
9899
self.final_state, self.execution_info = self.graph.execute(inputs)
99100

scrapegraphai/graphs/json_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ def run(self) -> str:
9595
Returns:
9696
str: The answer to the prompt.
9797
"""
98+
9899
inputs = {"user_prompt": self.prompt, "jsons": self.source}
99100
self.final_state, self.execution_info = self.graph.execute(inputs)
100101

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,15 @@ def _create_graph(self) -> BaseGraph:
6262
Returns:
6363
BaseGraph: A graph instance representing the web scraping workflow.
6464
"""
65+
6566
fetch_node = FetchNode(
6667
input="url | local_dir",
6768
output=["doc"],
6869
node_config={
6970
"loader_kwargs": self.config.get("loader_kwargs", {}),
7071
}
7172
)
73+
7274
parse_node = ParseNode(
7375
input="doc & (url | local_dir)",
7476
output=["parsed_doc", "link_urls", "img_urls"],
@@ -78,6 +80,7 @@ def _create_graph(self) -> BaseGraph:
7880
"llm_model": self.llm_model
7981
}
8082
)
83+
8184
image_to_text_node = ImageToTextNode(
8285
input="img_urls",
8386
output=["img_desc"],

scrapegraphai/graphs/omni_search_graph.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,6 @@ def _create_graph(self) -> BaseGraph:
5959
BaseGraph: A graph instance representing the web scraping and searching workflow.
6060
"""
6161

62-
# omni_scraper_instance = OmniScraperGraph(
63-
# prompt="",
64-
# source="",
65-
# config=self.copy_config,
66-
# schema=self.copy_schema
67-
# )
68-
6962
search_internet_node = SearchInternetNode(
7063
input="user_prompt",
7164
output=["urls"],
@@ -115,6 +108,7 @@ def run(self) -> str:
115108
Returns:
116109
str: The answer to the prompt.
117110
"""
111+
118112
inputs = {"user_prompt": self.prompt}
119113
self.final_state, self.execution_info = self.graph.execute(inputs)
120114

scrapegraphai/graphs/script_creator_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def run(self) -> str:
9191
Returns:
9292
str: The answer to the prompt.
9393
"""
94+
9495
inputs = {"user_prompt": self.prompt, "urls": self.source}
9596
self.final_state, self.execution_info = self.graph.execute(inputs)
9697
return self.final_state.get("merged_script", "Failed to generate the script.")

scrapegraphai/graphs/search_graph.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ def run(self) -> str:
110110
Returns:
111111
str: The answer to the prompt.
112112
"""
113+
113114
inputs = {"user_prompt": self.prompt}
114115
self.final_state, self.execution_info = self.graph.execute(inputs)
115116

@@ -126,4 +127,5 @@ def get_considered_urls(self) -> List[str]:
126127
Returns:
127128
List[str]: A list of URLs considered during the search.
128129
"""
130+
129131
return self.considered_urls

scrapegraphai/graphs/search_link_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def _create_graph(self) -> BaseGraph:
4747
Returns:
4848
BaseGraph: A graph instance representing the web scraping workflow.
4949
"""
50+
5051
fetch_node = FetchNode(
5152
input="url| local_dir",
5253
output=["doc"],

scrapegraphai/graphs/smart_scraper_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def _create_graph(self) -> BaseGraph:
5959
Returns:
6060
BaseGraph: A graph instance representing the web scraping workflow.
6161
"""
62+
6263
fetch_node = FetchNode(
6364
input="url| local_dir",
6465
output=["doc"],

scrapegraphai/graphs/smart_scraper_multi_concat_graph.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ class SmartScraperMultiConcatGraph(AbstractGraph):
4141
... )
4242
>>> result = search_graph.run()
4343
"""
44-
44+
4545
def __init__(self, prompt: str, source: List[str],
4646
config: dict, schema: Optional[BaseModel] = None):
4747

@@ -122,6 +122,7 @@ def run(self) -> str:
122122
Returns:
123123
str: The answer to the prompt.
124124
"""
125+
125126
inputs = {"user_prompt": self.prompt, "urls": self.source}
126127
self.final_state, self.execution_info = self.graph.execute(inputs)
127128

scrapegraphai/graphs/smart_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def run(self) -> str:
9696
Returns:
9797
str: The answer to the prompt.
9898
"""
99+
99100
inputs = {"user_prompt": self.prompt, "urls": self.source}
100101
self.final_state, self.execution_info = self.graph.execute(inputs)
101102

scrapegraphai/graphs/xml_scraper_multi_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ def run(self) -> str:
9393
Returns:
9494
str: The answer to the prompt.
9595
"""
96+
9697
inputs = {"user_prompt": self.prompt, "xmls": self.source}
9798
self.final_state, self.execution_info = self.graph.execute(inputs)
9899

scrapegraphai/utils/tokenizer.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from langchain_ollama import ChatOllama
77
from langchain_mistralai import ChatMistralAI
88
from langchain_core.language_models.chat_models import BaseChatModel
9+
from transformers import GPT2TokenizerFast
910

1011
def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
1112
"""
@@ -23,6 +24,13 @@ def num_tokens_calculus(string: str, llm_model: BaseChatModel) -> int:
2324
from .tokenizers.tokenizer_ollama import num_tokens_ollama
2425
num_tokens_fn = num_tokens_ollama
2526

27+
elif isinstance(llm_model, GPT2TokenizerFast):
28+
def num_tokens_gpt2(text: str, model: BaseChatModel) -> int:
29+
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
30+
tokens = tokenizer.encode(text)
31+
return len(tokens)
32+
num_tokens_fn = num_tokens_gpt2
33+
2634
else:
2735
from .tokenizers.tokenizer_openai import num_tokens_openai
2836
num_tokens_fn = num_tokens_openai

scrapegraphai/utils/tokenizers/tokenizer_ollama.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44
from langchain_core.language_models.chat_models import BaseChatModel
55
from ..logging import get_logger
6+
from transformers import GPT2TokenizerFast
67

78
def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
89
"""
@@ -21,8 +22,12 @@ def num_tokens_ollama(text: str, llm_model:BaseChatModel) -> int:
2122

2223
logger.debug(f"Counting tokens for text of {len(text)} characters")
2324

25+
if isinstance(llm_model, GPT2TokenizerFast):
26+
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
27+
tokens = tokenizer.encode(text)
28+
return len(tokens)
29+
2430
# Use langchain token count implementation
2531
# NB: https://github.com/ollama/ollama/issues/1716#issuecomment-2074265507
2632
tokens = llm_model.get_num_tokens(text)
2733
return tokens
28-

tests/graphs/smart_scraper_ollama_test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44
import pytest
55
from scrapegraphai.graphs import SmartScraperGraph
6+
from transformers import GPT2TokenizerFast
67

78

89
@pytest.fixture
@@ -50,3 +51,11 @@ def test_get_execution_info(graph_config: dict):
5051
graph_exec_info = smart_scraper_graph.get_execution_info()
5152

5253
assert graph_exec_info is not None
54+
55+
56+
def test_gpt2_tokenizer_loading():
57+
"""
58+
Test loading of GPT2TokenizerFast
59+
"""
60+
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
61+
assert tokenizer is not None

0 commit comments

Comments (0)