Skip to content

Commit e0a5e73

Browse files
authored
Merge pull request #573 from ScrapeGraphAI/ligthweigthing_library
Lightweighting library
2 parents f7ba1f3 + 26de5dd commit e0a5e73

File tree

4 files changed

+31
-33
lines changed

4 files changed

+31
-33
lines changed

docs/README.md

Lines changed: 1 addition & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -9,12 +9,6 @@ markmap:
99

1010
## **Short-Term Goals**
1111

12-
- Integration with more llm APIs
13-
14-
- Test proxy rotation implementation
15-
16-
- Add more search engines inside the SearchInternetNode
17-
1812
- Improve the documentation (ReadTheDocs)
1913
- [Issue #102](https://github.com/VinciGit00/Scrapegraph-ai/issues/102)
2014

@@ -23,9 +17,6 @@ markmap:
2317
## **Medium-Term Goals**
2418

2519
- Node for handling API requests
26-
27-
- Improve SearchGraph to look into the first 5 results of the search engine
28-
2920
- Make scraping more deterministic
3021
- Create DOM tree of the website
3122
- HTML tag text embeddings with tags metadata
@@ -70,6 +61,4 @@ markmap:
7061

7162
- Automatic generation of scraping pipelines from a given prompt
7263

73-
- Create API for the library
74-
75-
- Finetune a LLM for html content
64+
- Create API for the library

pyproject.toml

Lines changed: 22 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,7 @@
11
[project]
22
name = "scrapegraphai"
3-
4-
53
version = "1.14.1b1"
6-
7-
84
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
9-
105
authors = [
116
{ name = "Marco Vinciguerra", email = "[email protected]" },
127
{ name = "Marco Perini", email = "[email protected]" },
@@ -15,32 +10,24 @@ authors = [
1510

1611
dependencies = [
1712
"langchain>=0.2.14",
18-
"langchain-fireworks>=0.1.3",
19-
"langchain_community>=0.2.9",
2013
"langchain-google-genai>=1.0.7",
21-
"langchain-google-vertexai>=1.0.7",
2214
"langchain-openai>=0.1.22",
23-
"langchain-groq>=0.1.3",
24-
"langchain-aws>=0.1.3",
25-
"langchain-anthropic>=0.1.11",
2615
"langchain-mistralai>=0.1.12",
27-
"langchain-huggingface>=0.0.3",
28-
"langchain-nvidia-ai-endpoints>=0.1.6",
16+
"langchain_community>=0.2.9",
17+
"langchain-aws>=0.1.3",
2918
"html2text>=2024.2.26",
3019
"faiss-cpu>=1.8.0",
3120
"beautifulsoup4>=4.12.3",
3221
"pandas>=2.2.2",
3322
"python-dotenv>=1.0.1",
3423
"tiktoken>=0.7",
3524
"tqdm>=4.66.4",
36-
"graphviz>=0.20.3",
3725
"minify-html>=0.15.0",
3826
"free-proxy>=1.1.1",
3927
"playwright>=1.43.0",
40-
"google>=3.0.0",
4128
"undetected-playwright>=0.3.0",
29+
"google>=3.0.0",
4230
"semchunk>=1.0.1",
43-
"browserbase>=0.3.0",
4431
]
4532

4633
license = "MIT"
@@ -79,6 +66,25 @@ requires-python = ">=3.9,<4.0"
7966
burr = ["burr[start]==0.22.1"]
8067
docs = ["sphinx==6.0", "furo==2024.5.6"]
8168

69+
# Group 1: Other Language Models
70+
other-language-models = [
71+
"langchain-fireworks>=0.1.3",
72+
"langchain-groq>=0.1.3",
73+
"langchain-anthropic>=0.1.11",
74+
"langchain-huggingface>=0.0.3",
75+
"langchain-nvidia-ai-endpoints>=0.1.6",
76+
]
77+
78+
# Group 2: More Semantic Options
79+
more-semantic-options = [
80+
"graphviz>=0.20.3",
81+
]
82+
83+
# Group 3: More Browser Options
84+
more-browser-options = [
85+
"browserbase>=0.3.0",
86+
]
87+
8288
[build-system]
8389
requires = ["hatchling"]
8490
build-backend = "hatchling.build"

scrapegraphai/graphs/abstract_graph.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -145,15 +145,18 @@ def handle_model(model_name, provider, token_key, default_token=8192):
145145
warnings.simplefilter("ignore")
146146
return init_chat_model(**llm_params)
147147

148-
known_models = {"chatgpt","gpt","openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"}
148+
known_models = ["chatgpt","gpt","openai", "azure_openai", "google_genai",
149+
"ollama", "oneapi", "nvidia", "groq", "google_vertexai",
150+
"bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"]
151+
152+
149153
if llm_params["model"].split("/")[0] not in known_models and llm_params["model"].split("-")[0] not in known_models:
150154
raise ValueError(f"Model '{llm_params['model']}' is not supported")
151155

152156
try:
153157
if "azure" in llm_params["model"]:
154158
model_name = llm_params["model"].split("/")[-1]
155-
return handle_model(model_name, "azure_openai", model_name)
156-
159+
return handle_model(model_name, "azure_openai", model_name)
157160
if "fireworks" in llm_params["model"]:
158161
model_name = "/".join(llm_params["model"].split("/")[1:])
159162
token_key = llm_params["model"].split("/")[-1]
@@ -185,7 +188,6 @@ def handle_model(model_name, provider, token_key, default_token=8192):
185188
model_name = llm_params["model"].split("/")[-1]
186189
return handle_model(model_name, "mistralai", model_name)
187190

188-
# Instantiate the language model based on the model name (models that do not use the common interface)
189191
elif "deepseek" in llm_params["model"]:
190192
try:
191193
self.model_token = models_tokens["deepseek"][llm_params["model"]]

scrapegraphai/nodes/fetch_node.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,6 @@
1010
from langchain_core.documents import Document
1111
from ..utils.cleanup_html import cleanup_html
1212
from ..docloaders import ChromiumLoader
13-
from ..docloaders.browser_base import browser_base_fetch
1413
from ..utils.convert_to_md import convert_to_md
1514
from ..utils.logging import get_logger
1615
from .base_node import BaseNode
@@ -269,6 +268,8 @@ def handle_web_source(self, state, source):
269268
loader_kwargs = self.node_config.get("loader_kwargs", {})
270269

271270
if self.browser_base is not None:
271+
from ..docloaders.browser_base import browser_base_fetch
272+
272273
data = browser_base_fetch(self.browser_base.get("api_key"),
273274
self.browser_base.get("project_id"), [source])
274275

0 commit comments

Comments
 (0)