Asdt #168

Merged · 10 commits · May 7, 2024

171 changes: 171 additions & 0 deletions examples/custom_graph_domtree.py
@@ -0,0 +1,171 @@
"""
Example of custom graph using existing nodes
"""

import os
from dotenv import load_dotenv
from scrapegraphai.models import OpenAI
from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import FetchNode, GenerateAnswerNode
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
        "temperature": 0,
        "streaming": True
    },
}

# ************************************************
# Define the graph nodes
# ************************************************

llm_model = OpenAI(graph_config["llm"])

# define the nodes for the graph
fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],
)
generate_answer_node = GenerateAnswerNode(
    input="user_prompt & (relevant_chunks | parsed_doc | doc)",
    output=["answer"],
    node_config={"llm": llm_model},
)

# ************************************************
# Create the graph by defining the connections
# ************************************************

graph = BaseGraph(
    nodes=[
        fetch_node,
        generate_answer_node,
    ],
    edges=[
        (fetch_node, generate_answer_node),
    ],
    entry_point=fetch_node
)

# ************************************************
# Execute the graph
# ************************************************

subtree_text = '''
div>div -> "This is a paragraph" \n
div>ul>li>a>span -> "This is a list item 1" \n
div>ul>li>a>span -> "This is a list item 2" \n
div>ul>li>a>span -> "This is a list item 3"
'''

subtree_simplified_html = '''
<div>
    <div>This is a paragraph</div>
    <ul>
        <li>
            <span>This is a list item 1</span>
        </li>
        <li>
            <span>This is a list item 2</span>
        </li>
        <li>
            <span>This is a list item 3</span>
        </li>
    </ul>
</div>
'''

subtree_dict_simple = {
    "div": {
        "text": {
            "content": "This is a paragraph",
            "path_to_fork": "div>div",
        },
        "ul": {
            "path_to_fork": "div>ul",
            "texts": [
                {
                    "content": "This is a list item 1",
                    "path_to_fork": "ul>li>a>span",
                },
                {
                    "content": "This is a list item 2",
                    "path_to_fork": "ul>li>a>span",
                },
                {
                    "content": "This is a list item 3",
                    "path_to_fork": "ul>li>a>span",
                }
            ]
        }
    }
}


subtree_dict_complex = {
    "div": {
        "text": {
            "content": "This is a paragraph",
            "path_to_fork": "div>div",
            "attributes": {
                "classes": ["paragraph"],
                "ids": ["paragraph"],
                "hrefs": ["https://www.example.com"]
            }
        },
        "ul": {
            "text1": {
                "content": "This is a list item 1",
                "path_to_fork": "ul>li>a>span",
                "attributes": {
                    "classes": ["list-item", "item-1"],
                    "ids": ["item-1"],
                    "hrefs": ["https://www.example.com"]
                }
            },
            "text2": {
                "content": "This is a list item 2",
                "path_to_fork": "ul>li>a>span",
                "attributes": {
                    "classes": ["list-item", "item-2"],
                    "ids": ["item-2"],
                    "hrefs": ["https://www.example.com"]
                }
            }
        }
    }
}
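
# For reference, a tiny helper (hypothetical, not part of the library) that
# walks the subtree dicts above and collects every "content" field:
def collect_contents(subtree_dict):
    """Recursively gather all 'content' values from a subtree dict."""
    contents = []
    if isinstance(subtree_dict, dict):
        for key, value in subtree_dict.items():
            if key == "content":
                contents.append(value)
            else:
                contents.extend(collect_contents(value))
    elif isinstance(subtree_dict, list):
        for item in subtree_dict:
            contents.extend(collect_contents(item))
    return contents

print(collect_contents(subtree_dict_simple))
# -> ['This is a paragraph', 'This is a list item 1',
#     'This is a list item 2', 'This is a list item 3']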

from playwright.sync_api import sync_playwright, Playwright

def run(playwright: Playwright):
    chromium = playwright.chromium  # or playwright.firefox / playwright.webkit
    browser = chromium.launch()
    page = browser.new_page()
    page.goto("https://www.wired.com/category/science/")
    # get the accessibility tree of the page
    accessibility_tree = page.accessibility.snapshot()

    result, execution_info = graph.execute({
        "user_prompt": "List me all the latest news with their description.",
        "local_dir": str(accessibility_tree)
    })

    # get the answer from the result
    result = result.get("answer", "No answer found.")
    print(result)
    # other actions...
    browser.close()

with sync_playwright() as playwright:
    run(playwright)
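
# A minimal sketch of an alternative invocation (assumption: given FetchNode's
# "url | local_dir" input expression, the graph can also fetch a plain URL
# passed in the initial state instead of the serialized accessibility tree):
#
# result, _ = graph.execute({
#     "user_prompt": "List me all the latest news with their description.",
#     "url": "https://www.wired.com/category/science/",
# })
# print(result.get("answer", "No answer found."))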

99 changes: 99 additions & 0 deletions examples/domtree_example.py
@@ -0,0 +1,99 @@
import time

from langchain_community.document_loaders import AsyncHtmlLoader
from scrapegraphai.asdt import DOMTree

def index_subtrees(subtrees):
    from collections import defaultdict
    structure_index = defaultdict(list)
    content_index = defaultdict(list)

    for subtree in subtrees:
        structure_hash = subtree.root.structure_hash
        content_hash = subtree.root.content_hash

        structure_index[structure_hash].append(subtree)
        content_index[content_hash].append(subtree)

    return structure_index, content_index

def find_matching_subtrees(index):
    matches = []
    for hash_key, subtrees in index.items():
        if len(subtrees) > 1:
            # Generate pairs of matched subtrees
            for i in range(len(subtrees)):
                for j in range(i + 1, len(subtrees)):
                    matches.append((subtrees[i], subtrees[j]))
    return matches

def print_subtree_details(subtree):
    """A helper function to print subtree details for comparison."""
    nodes = []
    subtree.traverse(lambda node: nodes.append(f"{node.value}: {node.attributes.get('content', '')}"))
    return " | ".join(nodes)

def print_matches_side_by_side(matches):
    for match_pair in matches:
        subtree1, subtree2 = match_pair
        subtree1_details = print_subtree_details(subtree1)
        subtree2_details = print_subtree_details(subtree2)
        print("Match Pair:")
        print("Subtree 1:", subtree1_details)
        print("Subtree 2:", subtree2_details)
        print("\n" + "-" * 100 + "\n")

# *********************************************************************************************************************
# Usage example:
# *********************************************************************************************************************

loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content

curr_time = time.time()
# Instantiate a DOMTree with HTML content
dom_tree = DOMTree(html_content)
# nodes, metadatas = dom_tree.collect_text_nodes()  # Collect text nodes for analysis
# for node, metadata in zip(nodes, metadatas):
#     print("Text:", node)
#     print("Metadata:", metadata)

# sub_list = dom_tree.generate_subtree_dicts() # Generate subtree dictionaries for analysis
# print(sub_list)
# graph = dom_tree.visualize(exclude_tags=['script', 'style', 'meta', 'link'])
subtrees = dom_tree.get_subtrees() # Retrieve subtrees rooted at fork nodes
print("Number of subtrees found:", len(subtrees))

# remove trees whose root node does not lead to any text
text_subtrees = [subtree for subtree in subtrees if subtree.root.leads_to_text]
print("Number of subtrees that lead to text:", len(text_subtrees))

direct_leaf_subtrees = [subtree for subtree in text_subtrees if subtree.root.has_direct_leaves]
print("Number of subtrees with direct leaves beneath fork nodes:", len(direct_leaf_subtrees))

for subtree in direct_leaf_subtrees:
    print("Subtree rooted at:", subtree.root.value)
    subtree.traverse(lambda node: print(node))
# Index subtrees by structure and content
# structure_index, content_index = index_subtrees(subtrees)

# # Find matches based on structure
# structure_matches = find_matching_subtrees(structure_index)
# print("Structure-based matches found:", len(structure_matches))

# # Print structure-based matches side by side
# print_matches_side_by_side(structure_matches)

# # Optionally, do the same for content-based matches if needed
# content_matches = find_matching_subtrees(content_index)
# print("Content-based matches found:", len(content_matches))
# print_matches_side_by_side(content_matches)

print(f"Time taken to build DOM tree: {time.time() - curr_time:.2f} seconds")

# Optionally, traverse each subtree
# for subtree in subtrees:
# print("Subtree rooted at:", subtree.root.value)
# subtree.traverse(lambda node: print(node))
# Traverse the DOMTree and print each node
# dom_tree.traverse(lambda node: print(node))
34 changes: 34 additions & 0 deletions examples/faiss_vector.py
@@ -0,0 +1,34 @@
import os
import time

from dotenv import load_dotenv
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from scrapegraphai.asdt import DOMTree

load_dotenv()
openai_key = os.getenv("OPENAI_APIKEY")
embeddings = OpenAIEmbeddings(api_key=openai_key)

loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content

curr_time = time.time()
# Instantiate a DOMTree with HTML content
dom_tree = DOMTree(html_content)
text_nodes, metadata = dom_tree.collect_text_nodes() # Collect text nodes for analysis

print(f"Time taken to collect text nodes: {time.time() - curr_time}")

db_texts = FAISS.from_texts(
    texts=text_nodes,
    embedding=embeddings,
    metadatas=metadata
)

# Query for similar text
query = "List me all the projects"
