Merge pull request #573 from ScrapeGraphAI/ligthweigthing_library

VinciGit00 · web-flow · commit e0a5e731cf8d · 2024-08-23T12:22:45.000+02:00
Lightweighting library
diff --git a/docs/README.md b/docs/README.md
@@ -9,12 +9,6 @@ markmap:
 
 ## **Short-Term Goals**
 
-- Integration with more llm APIs
-
-- Test proxy rotation implementation
-
-- Add more search engines inside the SearchInternetNode
-
 - Improve the documentation (ReadTheDocs)
     - [Issue #102](https://github.com/VinciGit00/Scrapegraph-ai/issues/102)
 
@@ -23,9 +17,6 @@ markmap:
 ## **Medium-Term Goals**
 
 - Node for handling API requests
-
-- Improve SearchGraph to look into the first 5 results of the search engine
-
 - Make scraping more deterministic
     - Create DOM tree of the website
     - HTML tag text embeddings with tags metadata
@@ -70,6 +61,4 @@ markmap:
 
 - Automatic generation of scraping pipelines from a given prompt
 
-- Create API for the library
-
-- Finetune a LLM for html content
+- Create API for the library
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,12 +1,7 @@
 [project]
 name = "scrapegraphai"
-
-
 version = "1.14.1b1"
-
-
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
-
 authors = [
     { name = "Marco Vinciguerra", email = "mvincig11@gmail.com" },
     { name = "Marco Perini", email = "perinim.98@gmail.com" },
@@ -15,32 +10,24 @@ authors = [
 
 dependencies = [
     "langchain>=0.2.14",
-    "langchain-fireworks>=0.1.3",
-    "langchain_community>=0.2.9",
     "langchain-google-genai>=1.0.7",
-    "langchain-google-vertexai>=1.0.7",
     "langchain-openai>=0.1.22",
-    "langchain-groq>=0.1.3",
-    "langchain-aws>=0.1.3",
-    "langchain-anthropic>=0.1.11",
     "langchain-mistralai>=0.1.12",
-    "langchain-huggingface>=0.0.3",
-    "langchain-nvidia-ai-endpoints>=0.1.6",
+    "langchain_community>=0.2.9",
+    "langchain-aws>=0.1.3",
     "html2text>=2024.2.26",
     "faiss-cpu>=1.8.0",
     "beautifulsoup4>=4.12.3",
     "pandas>=2.2.2",
     "python-dotenv>=1.0.1",
     "tiktoken>=0.7",
     "tqdm>=4.66.4",
-    "graphviz>=0.20.3",
     "minify-html>=0.15.0",
     "free-proxy>=1.1.1",
     "playwright>=1.43.0",
-    "google>=3.0.0",
     "undetected-playwright>=0.3.0",
+    "google>=3.0.0",
     "semchunk>=1.0.1",
-    "browserbase>=0.3.0",
 ]
 
 license = "MIT"
@@ -79,6 +66,25 @@ requires-python = ">=3.9,<4.0"
 burr = ["burr[start]==0.22.1"]
 docs = ["sphinx==6.0", "furo==2024.5.6"]
 
+# Group 1: Other Language Models
+other-language-models = [
+    "langchain-fireworks>=0.1.3",
+    "langchain-groq>=0.1.3",
+    "langchain-anthropic>=0.1.11",
+    "langchain-huggingface>=0.0.3",
+    "langchain-nvidia-ai-endpoints>=0.1.6",
+]
+
+# Group 2: More Semantic Options
+more-semantic-options = [
+    "graphviz>=0.20.3",
+]
+
+# Group 3: More Browser Options
+more-browser-options = [
+    "browserbase>=0.3.0",
+]
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
@@ -145,15 +145,18 @@ def handle_model(model_name, provider, token_key, default_token=8192):
                 warnings.simplefilter("ignore")
                 return init_chat_model(**llm_params)
 
-        known_models = {"chatgpt","gpt","openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"}
+        known_models = ["chatgpt","gpt","openai", "azure_openai", "google_genai",
+                         "ollama", "oneapi", "nvidia", "groq", "google_vertexai", 
+                         "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]
+
+
         if llm_params["model"].split("/")[0] not in known_models and llm_params["model"].split("-")[0] not in known_models:
             raise ValueError(f"Model '{llm_params['model']}' is not supported")
 
         try:
             if "azure" in llm_params["model"]:
                  model_name = llm_params["model"].split("/")[-1]
-                 return handle_model(model_name, "azure_openai", model_name)
-	        
+                 return handle_model(model_name, "azure_openai", model_name)	        
             if "fireworks" in llm_params["model"]:
                 model_name = "/".join(llm_params["model"].split("/")[1:])
                 token_key = llm_params["model"].split("/")[-1]
@@ -185,7 +188,6 @@ def handle_model(model_name, provider, token_key, default_token=8192):
                 model_name = llm_params["model"].split("/")[-1]
                 return handle_model(model_name, "mistralai", model_name)
 
-            # Instantiate the language model based on the model name (models that do not use the common interface)
             elif "deepseek" in llm_params["model"]:
                 try:
                     self.model_token = models_tokens["deepseek"][llm_params["model"]]
diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py
@@ -10,7 +10,6 @@
 from langchain_core.documents import Document
 from ..utils.cleanup_html import cleanup_html
 from ..docloaders import ChromiumLoader
-from ..docloaders.browser_base import browser_base_fetch
 from ..utils.convert_to_md import convert_to_md
 from ..utils.logging import get_logger
 from .base_node import BaseNode
@@ -269,6 +268,8 @@ def handle_web_source(self, state, source):
                 loader_kwargs = self.node_config.get("loader_kwargs", {})
 
             if self.browser_base is not None:
+                from ..docloaders.browser_base import browser_base_fetch
+
                 data =  browser_base_fetch(self.browser_base.get("api_key"),
                                             self.browser_base.get("project_id"), [source])