Skip to content

Commit 1b545c0

Browse files
authored
Merge pull request #418 from VinciGit00/temp
Pre/beta
2 parents 5d2e592 + b6a78e9 commit 1b545c0

28 files changed

+277
-108
lines changed

CHANGELOG.md

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,23 @@
1-
## [1.7.5](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.4...v1.7.5) (2024-06-28)
1+
## [1.8.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.4...v1.8.0-beta.1) (2024-06-25)
2+
3+
4+
### Features
5+
6+
* add new search engine availability and new tests ([073d226](https://github.com/VinciGit00/Scrapegraph-ai/commit/073d226723f5f03b960865d07408905b7a506180))
7+
* add research with bing + test function ([aa2160c](https://github.com/VinciGit00/Scrapegraph-ai/commit/aa2160c108764745a696ffc16038f370e9702c14))
8+
29

310

411
### Bug Fixes
512

6-
* add new claude model ([4d93641](https://github.com/VinciGit00/Scrapegraph-ai/commit/4d936410ccaa3a4b810065e0e84b49b15c09fb28))
13+
* updated for schema changes ([aedda44](https://github.com/VinciGit00/Scrapegraph-ai/commit/aedda448682ce5a921a62e661bffb02478bab75f))
14+
15+
16+
### CI
17+
18+
* **release:** 1.7.0-beta.13 [skip ci] ([ce0a47a](https://github.com/VinciGit00/Scrapegraph-ai/commit/ce0a47aee5edbb26fd82e41f6688a4bc48a10822))
19+
* **release:** 1.7.0-beta.14 [skip ci] ([ec77ff7](https://github.com/VinciGit00/Scrapegraph-ai/commit/ec77ff7ea4eb071469c2fb53e5959d4ea1f73ad6))
20+
721

822
## [1.7.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.3...v1.7.4) (2024-06-21)
923

@@ -46,6 +60,7 @@
4660
## [1.7.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.6.1...v1.7.0) (2024-06-17)
4761

4862

63+
4964
### Features
5065

5166
* add caching ([d790361](https://github.com/VinciGit00/Scrapegraph-ai/commit/d79036149a3197a385b73553f29df66d36480c38))
@@ -143,6 +158,7 @@
143158
* **release:** 1.7.0-beta.8 [skip ci] ([a87702f](https://github.com/VinciGit00/Scrapegraph-ai/commit/a87702f107f3fd16ee73e1af1585cd763788bf46))
144159
* **release:** 1.7.0-beta.9 [skip ci] ([0c5d6e2](https://github.com/VinciGit00/Scrapegraph-ai/commit/0c5d6e2c82b9ee81c91cd2325948bb5a4eddcb31))
145160

161+
146162
## [1.7.0-beta.12](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.7.0-beta.11...v1.7.0-beta.12) (2024-06-17)
147163

148164

examples/ernie/smart_scraper_schema_ernie.py

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,32 +2,31 @@
22
Basic example of scraping pipeline using SmartScraper with schema
33
"""
44

5-
import os, json
5+
import json
6+
import os
7+
from typing import Dict
8+
69
from dotenv import load_dotenv
10+
from pydantic import BaseModel
11+
712
from scrapegraphai.graphs import SmartScraperGraph
813

14+
915
load_dotenv()
1016

1117
# ************************************************
1218
# Define the output schema for the graph
1319
# ************************************************
1420

15-
schema= """
16-
{
17-
"Projects": [
18-
"Project #":
19-
{
20-
"title": "...",
21-
"description": "...",
22-
},
23-
"Project #":
24-
{
25-
"title": "...",
26-
"description": "...",
27-
}
28-
]
29-
}
30-
"""
21+
22+
class Project(BaseModel):
23+
title: str
24+
description: str
25+
26+
27+
class Projects(BaseModel):
28+
Projects: Dict[str, Project]
29+
3130

3231
# ************************************************
3332
# Define the configuration for the graph
@@ -37,7 +36,7 @@
3736

3837
graph_config = {
3938
"llm": {
40-
"api_key":openai_key,
39+
"api_key": openai_key,
4140
"model": "gpt-3.5-turbo",
4241
},
4342
"verbose": True,
@@ -51,8 +50,8 @@
5150
smart_scraper_graph = SmartScraperGraph(
5251
prompt="List me all the projects with their description",
5352
source="https://perinim.github.io/projects/",
54-
schema=schema,
55-
config=graph_config
53+
schema=Projects,
54+
config=graph_config,
5655
)
5756

5857
result = smart_scraper_graph.run()

examples/huggingfacehub/smart_scraper_schema_huggingfacehub.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
import os
66
from dotenv import load_dotenv
7+
from typing import Dict
8+
9+
from pydantic import BaseModel
710
from scrapegraphai.graphs import SmartScraperGraph
811
from scrapegraphai.utils import prettify_exec_info
912
from langchain_community.llms import HuggingFaceEndpoint
@@ -13,22 +16,12 @@
1316
# Define the output schema for the graph
1417
# ************************************************
1518

16-
schema= """
17-
{
18-
"Projects": [
19-
"Project #":
20-
{
21-
"title": "...",
22-
"description": "...",
23-
},
24-
"Project #":
25-
{
26-
"title": "...",
27-
"description": "...",
28-
}
29-
]
30-
}
31-
"""
19+
class Project(BaseModel):
20+
title: str
21+
description: str
22+
23+
class Projects(BaseModel):
24+
Projects: Dict[str, Project]
3225

3326
## required environment variable in .env
3427
#HUGGINGFACEHUB_API_TOKEN
@@ -61,7 +54,7 @@
6154
smart_scraper_graph = SmartScraperGraph(
6255
prompt="List me all the projects with their description",
6356
source="https://perinim.github.io/projects/",
64-
schema=schema,
57+
schema=Projects,
6558
config=graph_config
6659
)
6760
result = smart_scraper_graph.run()

examples/mixed_models/smart_scraper_schema_groq_openai.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,13 @@
22
Basic example of scraping pipeline using SmartScraper with schema
33
"""
44

5-
import os, json
5+
import json
6+
import os
7+
from typing import Dict, List
8+
69
from dotenv import load_dotenv
10+
from pydantic import BaseModel
11+
712
from scrapegraphai.graphs import SmartScraperGraph
813
from scrapegraphai.utils import prettify_exec_info
914

@@ -13,22 +18,12 @@
1318
# Define the output schema for the graph
1419
# ************************************************
1520

16-
schema= """
17-
{
18-
"Projects": [
19-
"Project #":
20-
{
21-
"title": "...",
22-
"description": "...",
23-
},
24-
"Project #":
25-
{
26-
"title": "...",
27-
"description": "...",
28-
}
29-
]
30-
}
31-
"""
21+
class Project(BaseModel):
22+
title: str
23+
description: str
24+
25+
class Projects(BaseModel):
26+
Projects: Dict[str, Project]
3227

3328
# ************************************************
3429
# Define the configuration for the graph
@@ -60,7 +55,7 @@
6055
prompt="List me all the projects with their description.",
6156
# also accepts a string with the already downloaded HTML code
6257
source="https://perinim.github.io/projects/",
63-
schema=schema,
58+
schema=Projects,
6459
config=graph_config
6560
)
6661

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
Example of running a single existing node (SearchInternetNode) on its own
3+
"""
4+
5+
from scrapegraphai.models import Ollama
6+
from scrapegraphai.nodes import SearchInternetNode
7+
8+
# ************************************************
9+
# Define the configuration for the graph
10+
# ************************************************
11+
12+
graph_config = {
13+
"llm": {
14+
"model": "llama3",
15+
"temperature": 0,
16+
"streaming": True
17+
},
18+
"search_engine": "google",
19+
"max_results": 3,
20+
"verbose": True
21+
}
22+
23+
# ************************************************
24+
# Define the node
25+
# ************************************************
26+
27+
llm_model = Ollama(graph_config["llm"])
28+
29+
search_node = SearchInternetNode(
30+
input="user_input",
31+
output=["search_results"],
32+
node_config={
33+
"llm_model": llm_model,
34+
"search_engine": graph_config["search_engine"],
35+
"max_results": graph_config["max_results"],
36+
"verbose": graph_config["verbose"]
37+
}
38+
)
39+
40+
# ************************************************
41+
# Test the node
42+
# ************************************************
43+
44+
state = {
45+
"user_input": "What is the capital of France?"
46+
}
47+
48+
result = search_node.execute(state)
49+
50+
print(result)

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22
name = "scrapegraphai"
33

44

5-
version = "1.7.5"
6-
5+
version = "1.8.0b1"
76

87

98
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."

scrapegraphai/builders/graph_builder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,11 @@ class GraphBuilder:
4040
ValueError: If 'api_key' is not included in llm_config.
4141
"""
4242

43-
def __init__(self, user_prompt: str, config: dict):
43+
def __init__(self, prompt: str, config: dict):
4444
"""
4545
Initializes the GraphBuilder with a user prompt and language model configuration.
4646
"""
47-
self.user_prompt = user_prompt
47+
self.prompt = prompt
4848
self.config = config
4949
self.llm = self._create_llm(config["llm"])
5050
self.nodes_description = self._generate_nodes_description()
@@ -122,7 +122,7 @@ def build_graph(self):
122122
Returns:
123123
dict: A JSON representation of the graph configuration.
124124
"""
125-
return self.chain.invoke(self.user_prompt)
125+
return self.chain.invoke(self.prompt)
126126

127127
@staticmethod
128128
def convert_json_to_graphviz(json_data, format: str = 'pdf'):

scrapegraphai/graphs/abstract_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class AbstractGraph(ABC):
3939
prompt (str): The prompt for the graph.
4040
source (str): The source of the graph.
4141
config (dict): Configuration parameters for the graph.
42-
schema (str): The schema for the graph output.
42+
schema (BaseModel): The schema for the graph output.
4343
llm_model: An instance of a language model client, configured for generating answers.
4444
embedder_model: An instance of an embedding model client,
4545
configured for generating embeddings.

scrapegraphai/graphs/csv_scraper_multi_graph.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from copy import copy, deepcopy
66
from typing import List, Optional
77

8+
from pydantic import BaseModel
9+
810
from .base_graph import BaseGraph
911
from .abstract_graph import AbstractGraph
1012
from .csv_scraper_graph import CSVScraperGraph
@@ -32,7 +34,7 @@ class CSVScraperMultiGraph(AbstractGraph):
3234
prompt (str): The user prompt to search the internet.
3335
source (List[str]): The source of the graph.
3436
config (dict): Configuration parameters for the graph.
35-
schema (Optional[str]): The schema for the graph output.
37+
schema (Optional[BaseModel]): The schema for the graph output.
3638
3739
Example:
3840
>>> search_graph = MultipleSearchGraph(
@@ -42,7 +44,7 @@ class CSVScraperMultiGraph(AbstractGraph):
4244
>>> result = search_graph.run()
4345
"""
4446

45-
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
47+
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
4648

4749
self.max_results = config.get("max_results", 3)
4850

scrapegraphai/graphs/deep_scraper_graph.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class DeepScraperGraph(AbstractGraph):
3434
prompt (str): The prompt for the graph.
3535
source (str): The source of the graph.
3636
config (dict): Configuration parameters for the graph.
37-
schema (str): The schema for the graph output.
37+
schema (BaseModel): The schema for the graph output.
3838
llm_model: An instance of a language model client, configured for generating answers.
3939
embedder_model: An instance of an embedding model client,
4040
configured for generating embeddings.
@@ -45,7 +45,7 @@ class DeepScraperGraph(AbstractGraph):
4545
prompt (str): The prompt for the graph.
4646
source (str): The source of the graph.
4747
config (dict): Configuration parameters for the graph.
48-
schema (str): The schema for the graph output.
48+
schema (BaseModel): The schema for the graph output.
4949
5050
Example:
5151
>>> deep_scraper = DeepScraperGraph(

scrapegraphai/graphs/json_scraper_graph.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class JSONScraperGraph(AbstractGraph):
2323
prompt (str): The prompt for the graph.
2424
source (str): The source of the graph.
2525
config (dict): Configuration parameters for the graph.
26-
schema (str): The schema for the graph output.
26+
schema (BaseModel): The schema for the graph output.
2727
llm_model: An instance of a language model client, configured for generating answers.
2828
embedder_model: An instance of an embedding model client,
2929
configured for generating embeddings.
@@ -34,7 +34,7 @@ class JSONScraperGraph(AbstractGraph):
3434
prompt (str): The prompt for the graph.
3535
source (str): The source of the graph.
3636
config (dict): Configuration parameters for the graph.
37-
schema (str): The schema for the graph output.
37+
schema (BaseModel): The schema for the graph output.
3838
3939
Example:
4040
>>> json_scraper = JSONScraperGraph(

scrapegraphai/graphs/json_scraper_multi_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class JSONScraperMultiGraph(AbstractGraph):
3333
prompt (str): The user prompt to search the internet.
3434
source (List[str]): The source of the graph.
3535
config (dict): Configuration parameters for the graph.
36-
schema (Optional[str]): The schema for the graph output.
36+
schema (Optional[BaseModel]): The schema for the graph output.
3737
3838
Example:
3939
>>> search_graph = MultipleSearchGraph(

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class OmniScraperGraph(AbstractGraph):
2929
prompt (str): The prompt for the graph.
3030
source (str): The source of the graph.
3131
config (dict): Configuration parameters for the graph.
32-
schema (str): The schema for the graph output.
32+
schema (BaseModel): The schema for the graph output.
3333
llm_model: An instance of a language model client, configured for generating answers.
3434
embedder_model: An instance of an embedding model client,
3535
configured for generating embeddings.
@@ -41,7 +41,7 @@ class OmniScraperGraph(AbstractGraph):
4141
prompt (str): The prompt for the graph.
4242
source (str): The source of the graph.
4343
config (dict): Configuration parameters for the graph.
44-
schema (str): The schema for the graph output.
44+
schema (BaseModel): The schema for the graph output.
4545
4646
Example:
4747
>>> omni_scraper = OmniScraperGraph(

0 commit comments

Comments
 (0)