
Commit 3ee6f58

Merge pull request #168 from VinciGit00/asdt
Asdt
2 parents e1b9d69 + b326886 commit 3ee6f58

File tree

14 files changed: +1258, -3 lines changed


examples/custom_graph_domtree.py

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
"""
Example of custom graph using existing nodes
"""

import os
from dotenv import load_dotenv
from scrapegraphai.models import OpenAI
from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import FetchNode, GenerateAnswerNode
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
        "temperature": 0,
        "streaming": True
    },
}

# ************************************************
# Define the graph nodes
# ************************************************

llm_model = OpenAI(graph_config["llm"])

# define the nodes for the graph
fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],
)
generate_answer_node = GenerateAnswerNode(
    input="user_prompt & (relevant_chunks | parsed_doc | doc)",
    output=["answer"],
    node_config={"llm": llm_model},
)

# ************************************************
# Create the graph by defining the connections
# ************************************************

graph = BaseGraph(
    nodes={
        fetch_node,
        generate_answer_node,
    },
    edges={
        (fetch_node, generate_answer_node)
    },
    entry_point=fetch_node
)

# ************************************************
# Execute the graph
# ************************************************

subtree_text = '''
div>div -> "This is a paragraph" \n
div>ul>li>a>span -> "This is a list item 1" \n
div>ul>li>a>span -> "This is a list item 2" \n
div>ul>li>a>span -> "This is a list item 3"
'''

subtree_simplified_html = '''
<div>
    <div>This is a paragraph</div>
    <ul>
        <li>
            <span>This is a list item 1</span>
        </li>
        <li>
            <span>This is a list item 2</span>
        </li>
        <li>
            <span>This is a list item 3</span>
        </li>
    </ul>
</div>
'''

subtree_dict_simple = {
    "div": {
        "text": {
            "content": "This is a paragraph",
            "path_to_fork": "div>div",
        },
        "ul": {
            "path_to_fork": "div>ul",
            "texts": [
                {
                    "content": "This is a list item 1",
                    "path_to_fork": "ul>li>a>span",
                },
                {
                    "content": "This is a list item 2",
                    "path_to_fork": "ul>li>a>span",
                },
                {
                    "content": "This is a list item 3",
                    "path_to_fork": "ul>li>a>span",
                }
            ]
        }
    }
}


subtree_dict_complex = {
    "div": {
        "text": {
            "content": "This is a paragraph",
            "path_to_fork": "div>div",
            "attributes": {
                "classes": ["paragraph"],
                "ids": ["paragraph"],
                "hrefs": ["https://www.example.com"]
            }
        },
        "ul": {
            "text1": {
                "content": "This is a list item 1",
                "path_to_fork": "ul>li>a>span",
                "attributes": {
                    "classes": ["list-item", "item-1"],
                    "ids": ["item-1"],
                    "hrefs": ["https://www.example.com"]
                }
            },
            "text2": {
                "content": "This is a list item 2",
                "path_to_fork": "ul>li>a>span",
                "attributes": {
                    "classes": ["list-item", "item-2"],
                    "ids": ["item-2"],
                    "hrefs": ["https://www.example.com"]
                }
            }
        }
    }
}

from playwright.sync_api import sync_playwright, Playwright

def run(playwright: Playwright):
    chromium = playwright.chromium  # or "firefox" or "webkit"
    browser = chromium.launch()
    page = browser.new_page()
    page.goto("https://www.wired.com/category/science/")
    # get the accessibility tree of the page
    accessibility_tree = page.accessibility.snapshot()

    result, execution_info = graph.execute({
        "user_prompt": "List me all the latest news with their description.",
        "local_dir": str(accessibility_tree)
    })

    # get the answer from the result
    result = result.get("answer", "No answer found.")
    print(result)
    # other actions...
    browser.close()

with sync_playwright() as playwright:
    run(playwright)
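
A note on the execution above: the accessibility tree is passed under the "local_dir" key, satisfying the "url | local_dir" input expression declared on the FetchNode. For comparison, a minimal sketch of the other branch, letting the FetchNode load a page directly by URL (the prompt and URL here are illustrative; the execute call mirrors the one above):

# Sketch: same graph object, but using the "url" branch of the FetchNode's
# "url | local_dir" input expression instead of a pre-fetched document.
result, execution_info = graph.execute({
    "user_prompt": "List me all the projects with their descriptions.",
    "url": "https://perinim.github.io/projects/"
})
print(result.get("answer", "No answer found."))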

examples/domtree_example.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
from langchain_community.document_loaders import AsyncHtmlLoader
import time
from scrapegraphai.asdt import DOMTree

def index_subtrees(subtrees):
    from collections import defaultdict
    structure_index = defaultdict(list)
    content_index = defaultdict(list)

    for subtree in subtrees:
        structure_hash = subtree.root.structure_hash
        content_hash = subtree.root.content_hash

        structure_index[structure_hash].append(subtree)
        content_index[content_hash].append(subtree)

    return structure_index, content_index

def find_matching_subtrees(index):
    matches = []
    for hash_key, subtrees in index.items():
        if len(subtrees) > 1:
            # Generate pairs of matched subtrees
            for i in range(len(subtrees)):
                for j in range(i + 1, len(subtrees)):
                    matches.append((subtrees[i], subtrees[j]))
    return matches

def print_subtree_details(subtree):
    """ A helper function to print subtree details for comparison. """
    nodes = []
    subtree.traverse(lambda node: nodes.append(f"{node.value}: {node.attributes.get('content', '')}"))
    return " | ".join(nodes)

def print_matches_side_by_side(matches):
    for match_pair in matches:
        subtree1, subtree2 = match_pair
        subtree1_details = print_subtree_details(subtree1)
        subtree2_details = print_subtree_details(subtree2)
        print("Match Pair:")
        print("Subtree 1:", subtree1_details)
        print("Subtree 2:", subtree2_details)
        print("\n" + "-"*100 + "\n")

# *********************************************************************************************************************
# Usage example:
# *********************************************************************************************************************

loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content

curr_time = time.time()
# Instantiate a DOMTree with HTML content
dom_tree = DOMTree(html_content)
# nodes, metadatas = dom_tree.collect_text_nodes()  # Collect text nodes for analysis
# for node, metadata in zip(nodes, metadatas):
#     print("Text:", node)
#     print("Metadata:", metadata)

# sub_list = dom_tree.generate_subtree_dicts()  # Generate subtree dictionaries for analysis
# print(sub_list)
# graph = dom_tree.visualize(exclude_tags=['script', 'style', 'meta', 'link'])
subtrees = dom_tree.get_subtrees()  # Retrieve subtrees rooted at fork nodes
print("Number of subtrees found:", len(subtrees))

# remove trees whose root node does not lead to any text
text_subtrees = [subtree for subtree in subtrees if subtree.root.leads_to_text]
print("Number of subtrees that lead to text:", len(text_subtrees))

direct_leaf_subtrees = [subtree for subtree in text_subtrees if subtree.root.has_direct_leaves]
print("Number of subtrees with direct leaves beneath fork nodes:", len(direct_leaf_subtrees))

for subtree in direct_leaf_subtrees:
    print("Subtree rooted at:", subtree.root.value)
    subtree.traverse(lambda node: print(node))
# Index subtrees by structure and content
# structure_index, content_index = index_subtrees(subtrees)

# # Find matches based on structure
# structure_matches = find_matching_subtrees(structure_index)
# print("Structure-based matches found:", len(structure_matches))

# # Print structure-based matches side by side
# print_matches_side_by_side(structure_matches)

# # Optionally, do the same for content-based matches if needed
# content_matches = find_matching_subtrees(content_index)
# print("Content-based matches found:", len(content_matches))
# print_matches_side_by_side(content_matches)

print(f"Time taken to build DOM tree: {time.time() - curr_time:.2f} seconds")

# Optionally, traverse each subtree
# for subtree in subtrees:
#     print("Subtree rooted at:", subtree.root.value)
#     subtree.traverse(lambda node: print(node))
# Traverse the DOMTree and print each node
# dom_tree.traverse(lambda node: print(node))
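
The matching helpers defined at the top of this file are exercised only in the commented-out block. A minimal sketch of running them end to end, assuming `structure_hash` and `content_hash` behave as the code above implies:

# Sketch: index the text-bearing subtrees collected above, then report and
# display pairs that share a structure or content hash.
structure_index, content_index = index_subtrees(text_subtrees)

structure_matches = find_matching_subtrees(structure_index)
print("Structure-based matches found:", len(structure_matches))
print_matches_side_by_side(structure_matches)

content_matches = find_matching_subtrees(content_index)
print("Content-based matches found:", len(content_matches))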

examples/faiss_vector.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import AsyncHtmlLoader
import time
from scrapegraphai.asdt import DOMTree
from dotenv import load_dotenv
import os

load_dotenv()
openai_key = os.getenv("OPENAI_APIKEY")
embeddings = OpenAIEmbeddings(api_key=openai_key)

loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content

curr_time = time.time()
# Instantiate a DOMTree with HTML content
dom_tree = DOMTree(html_content)
text_nodes, metadata = dom_tree.collect_text_nodes()  # Collect text nodes for analysis

print(f"Time taken to collect text nodes: {time.time() - curr_time}")

db_texts = FAISS.from_texts(
    texts=text_nodes,
    embedding=embeddings,
    metadatas=metadata
)

# Query for similar text
query = "List me all the projects"
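
The example builds the index but stops after defining `query`. A minimal sketch of actually running the search, using LangChain's standard FAISS API:

# Sketch: retrieve the most similar text nodes for the query (k=4 results)
docs = db_texts.similarity_search(query, k=4)
for doc in docs:
    print(doc.page_content, "->", doc.metadata)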
