Skip to content

Commit a4700bf

Browse files
committed
add robot node
1 parent 0589083 commit a4700bf

File tree

10 files changed

+596
-53
lines changed

10 files changed

+596
-53
lines changed

examples/single_node/robot_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
graph_config = {
1313
"llm": {
14-
"model": "ollama/llama3",
14+
"model_name": "ollama/llama3",
1515
"temperature": 0,
1616
"streaming": True
1717
},

poetry.lock

Lines changed: 13 additions & 13 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scrapegraphai/nodes/robots_node.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,9 @@ def execute(self, state: dict) -> dict:
6161
ValueError: If the website is not scrapeable based on the robots.txt file and
6262
scraping is not enforced.
6363
"""
64-
logger = get_logger("robots node")
6564

6665
if self.verbose:
67-
logger.info(f"--- Executing {self.node_name} Node ---")
66+
self.logger.info(f"--- Executing {self.node_name} Node ---")
6867

6968
# Interpret input keys based on the provided input expression
7069
input_keys = self.get_input_keys(state)
@@ -97,12 +96,12 @@ def execute(self, state: dict) -> dict:
9796
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
9897
loader = AsyncChromiumLoader(f"{base_url}/robots.txt")
9998
document = loader.load()
100-
if "ollama" in self.llm_model.model_name:
101-
self.llm_model.model_name = self.llm_model.model_name.split("/")[-1]
102-
model = self.llm_model.model_name.split("/")[-1]
99+
if "ollama" in self.llm_model["model_name"]:
100+
self.llm_model["model_name"] = self.llm_model["model_name"].split("/")[-1]
101+
model = self.llm_model["model_name"].split("/")[-1]
103102

104103
else:
105-
model = self.llm_model.model_name
104+
model = self.llm_model["model_name"]
106105
try:
107106
agent = robots_dictionary[model]
108107

tests/nodes/fetch_node_test.py

Lines changed: 85 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,11 @@
1-
"""
2-
Module for testing fetch_node
3-
"""
1+
import os
42
import pytest
53
from scrapegraphai.nodes import FetchNode
64

7-
8-
@pytest.fixture
9-
def setup():
5+
def test_fetch_node_html():
106
"""
11-
setup
7+
Run the tests
128
"""
13-
# ************************************************
14-
# Define the node
15-
# ************************************************
16-
179
fetch_node = FetchNode(
1810
input="url | local_dir",
1911
output=["doc"],
@@ -22,21 +14,94 @@ def setup():
2214
}
2315
)
2416

25-
return fetch_node
17+
state = {
18+
"url": "https://twitter.com/home"
19+
}
2620

27-
# ************************************************
28-
# Test the node
29-
# ************************************************
21+
result = fetch_node.execute(state)
3022

23+
assert result is not None
3124

32-
def test_fetch_node(setup):
25+
def test_fetch_node_json():
3326
"""
3427
Run the tests
3528
"""
36-
state = {
37-
"url": "https://twitter.com/home"
29+
FILE_NAME_JSON = "inputs/example.json"
30+
curr_dir = os.path.dirname(os.path.realpath(__file__))
31+
file_path_json = os.path.join(curr_dir, FILE_NAME_JSON)
32+
33+
state_json = {
34+
"json": file_path_json
35+
}
36+
37+
fetch_node_json = FetchNode(
38+
input="json",
39+
output=["doc"],
40+
)
41+
42+
result_json = fetch_node_json.execute(state_json)
43+
44+
assert result_json is not None
45+
46+
def test_fetch_node_xml():
47+
"""
48+
Run the tests
49+
"""
50+
FILE_NAME_XML = "inputs/books.xml"
51+
curr_dir = os.path.dirname(os.path.realpath(__file__))
52+
file_path_xml = os.path.join(curr_dir, FILE_NAME_XML)
53+
54+
state_xml = {
55+
"xml": file_path_xml
3856
}
3957

40-
result = setup.execute(state)
58+
fetch_node_xml = FetchNode(
59+
input="xml",
60+
output=["doc"],
61+
)
4162

42-
assert result is not None
63+
result_xml = fetch_node_xml.execute(state_xml)
64+
65+
assert result_xml is not None
66+
67+
def test_fetch_node_csv():
68+
"""
69+
Run the tests
70+
"""
71+
FILE_NAME_CSV = "inputs/username.csv"
72+
curr_dir = os.path.dirname(os.path.realpath(__file__))
73+
file_path_csv = os.path.join(curr_dir, FILE_NAME_CSV)
74+
75+
state_csv = {
76+
"csv": file_path_csv # Define a dictionary with the key "csv" and the path of the CSV file as its value
77+
}
78+
79+
fetch_node_csv = FetchNode(
80+
input="csv",
81+
output=["doc"],
82+
)
83+
84+
result_csv = fetch_node_csv.execute(state_csv)
85+
86+
assert result_csv is not None
87+
88+
def test_fetch_node_txt():
89+
"""
90+
Run the tests
91+
"""
92+
FILE_NAME_TXT = "inputs/plain_html_example.txt"
93+
curr_dir = os.path.dirname(os.path.realpath(__file__))
94+
file_path_txt = os.path.join(curr_dir, FILE_NAME_TXT)
95+
96+
state_txt = {
97+
"txt": file_path_txt # Define a dictionary with the key "txt" and the path of the TXT file as its value
98+
}
99+
100+
fetch_node_txt = FetchNode(
101+
input="txt",
102+
output=["doc"],
103+
)
104+
105+
result_txt = fetch_node_txt.execute(state_txt)
106+
107+
assert result_txt is not None

tests/nodes/inputs/books.xml

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
<?xml version="1.0"?>
2+
<catalog>
3+
<book id="bk101">
4+
<author>Gambardella, Matthew</author>
5+
<title>XML Developer's Guide</title>
6+
<genre>Computer</genre>
7+
<price>44.95</price>
8+
<publish_date>2000-10-01</publish_date>
9+
<description>An in-depth look at creating applications
10+
with XML.</description>
11+
</book>
12+
<book id="bk102">
13+
<author>Ralls, Kim</author>
14+
<title>Midnight Rain</title>
15+
<genre>Fantasy</genre>
16+
<price>5.95</price>
17+
<publish_date>2000-12-16</publish_date>
18+
<description>A former architect battles corporate zombies,
19+
an evil sorceress, and her own childhood to become queen
20+
of the world.</description>
21+
</book>
22+
<book id="bk103">
23+
<author>Corets, Eva</author>
24+
<title>Maeve Ascendant</title>
25+
<genre>Fantasy</genre>
26+
<price>5.95</price>
27+
<publish_date>2000-11-17</publish_date>
28+
<description>After the collapse of a nanotechnology
29+
society in England, the young survivors lay the
30+
foundation for a new society.</description>
31+
</book>
32+
<book id="bk104">
33+
<author>Corets, Eva</author>
34+
<title>Oberon's Legacy</title>
35+
<genre>Fantasy</genre>
36+
<price>5.95</price>
37+
<publish_date>2001-03-10</publish_date>
38+
<description>In post-apocalypse England, the mysterious
39+
agent known only as Oberon helps to create a new life
40+
for the inhabitants of London. Sequel to Maeve
41+
Ascendant.</description>
42+
</book>
43+
<book id="bk105">
44+
<author>Corets, Eva</author>
45+
<title>The Sundered Grail</title>
46+
<genre>Fantasy</genre>
47+
<price>5.95</price>
48+
<publish_date>2001-09-10</publish_date>
49+
<description>The two daughters of Maeve, half-sisters,
50+
battle one another for control of England. Sequel to
51+
Oberon's Legacy.</description>
52+
</book>
53+
<book id="bk106">
54+
<author>Randall, Cynthia</author>
55+
<title>Lover Birds</title>
56+
<genre>Romance</genre>
57+
<price>4.95</price>
58+
<publish_date>2000-09-02</publish_date>
59+
<description>When Carla meets Paul at an ornithology
60+
conference, tempers fly as feathers get ruffled.</description>
61+
</book>
62+
<book id="bk107">
63+
<author>Thurman, Paula</author>
64+
<title>Splish Splash</title>
65+
<genre>Romance</genre>
66+
<price>4.95</price>
67+
<publish_date>2000-11-02</publish_date>
68+
<description>A deep sea diver finds true love twenty
69+
thousand leagues beneath the sea.</description>
70+
</book>
71+
<book id="bk108">
72+
<author>Knorr, Stefan</author>
73+
<title>Creepy Crawlies</title>
74+
<genre>Horror</genre>
75+
<price>4.95</price>
76+
<publish_date>2000-12-06</publish_date>
77+
<description>An anthology of horror stories about roaches,
78+
centipedes, scorpions and other insects.</description>
79+
</book>
80+
<book id="bk109">
81+
<author>Kress, Peter</author>
82+
<title>Paradox Lost</title>
83+
<genre>Science Fiction</genre>
84+
<price>6.95</price>
85+
<publish_date>2000-11-02</publish_date>
86+
<description>After an inadvertent trip through a Heisenberg
87+
Uncertainty Device, James Salway discovers the problems
88+
of being quantum.</description>
89+
</book>
90+
<book id="bk110">
91+
<author>O'Brien, Tim</author>
92+
<title>Microsoft .NET: The Programming Bible</title>
93+
<genre>Computer</genre>
94+
<price>36.95</price>
95+
<publish_date>2000-12-09</publish_date>
96+
<description>Microsoft's .NET initiative is explored in
97+
detail in this deep programmer's reference.</description>
98+
</book>
99+
<book id="bk111">
100+
<author>O'Brien, Tim</author>
101+
<title>MSXML3: A Comprehensive Guide</title>
102+
<genre>Computer</genre>
103+
<price>36.95</price>
104+
<publish_date>2000-12-01</publish_date>
105+
<description>The Microsoft MSXML3 parser is covered in
106+
detail, with attention to XML DOM interfaces, XSLT processing,
107+
SAX and more.</description>
108+
</book>
109+
<book id="bk112">
110+
<author>Galos, Mike</author>
111+
<title>Visual Studio 7: A Comprehensive Guide</title>
112+
<genre>Computer</genre>
113+
<price>49.95</price>
114+
<publish_date>2001-04-16</publish_date>
115+
<description>Microsoft Visual Studio 7 is explored in depth,
116+
looking at how Visual Basic, Visual C++, C#, and ASP+ are
117+
integrated into a comprehensive development
118+
environment.</description>
119+
</book>
120+
</catalog>

0 commit comments

Comments
 (0)