
Commit be16fec
WIP
1 parent 7ae50c0 commit be16fec

File tree: 6 files changed, +206 −0 lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -19,3 +19,4 @@ langchain-aws==0.1.2
 langchain-anthropic==0.1.11
 yahoo-search-py==0.3
 pypdf==4.2.0
+burr[start]
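
The new burr[start] extra installs Burr together with its local tracking server and the `burr` CLI that serves the tracking UI. A minimal smoke test that the dependency resolves, sketched under the assumption that the requirements are installed in the active environment:

# Illustrative check that the new dependency imports cleanly after
#   pip install -r requirements.txt
from burr.core import ApplicationBuilder, State, default
from burr.core.action import action

print("burr imports OK")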
(binary image file, 32.2 KB — preview not rendered)
Lines changed: 18 additions & 0 deletions
digraph {
    graph [compound=false concentrate=false rankdir=TB ranksep=0.4]
    fetch_node [label=fetch_node shape=box style=rounded]
    parse_node [label=parse_node shape=box style=rounded]
    input__chunk_size [label="input: chunk_size" shape=oval style=dashed]
    input__chunk_size -> parse_node
    rag_node [label=rag_node shape=box style=rounded]
    input__llm_model [label="input: llm_model" shape=oval style=dashed]
    input__llm_model -> rag_node
    input__embedder_model [label="input: embedder_model" shape=oval style=dashed]
    input__embedder_model -> rag_node
    generate_answer_node [label=generate_answer_node shape=box style=rounded]
    input__llm_model [label="input: llm_model" shape=oval style=dashed]
    input__llm_model -> generate_answer_node
    fetch_node -> parse_node [style=solid]
    parse_node -> rag_node [style=solid]
    rag_node -> generate_answer_node [style=solid]
}
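
This DOT source matches the graph that app.visualize(...) emits in the Burr module below: four action nodes in a straight line, plus dashed ovals for the runtime inputs chunk_size, llm_model, and embedder_model. To re-render it without Burr, a sketch using the graphviz Python package (an extra dependency; the .dot filename is an assumption, since the page does not show it):

import graphviz  # assumes `pip install graphviz` plus the Graphviz system binaries

# Renders smart_scraper_graph.dot to smart_scraper_graph.dot.png.
graphviz.render("dot", "png", "smart_scraper_graph.dot")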
(binary image file, 34 KB — preview not rendered)
Lines changed: 117 additions & 0 deletions
"""
SmartScraperGraph Module Burr Version
"""

from burr import tracking  # not wired in yet; see the tracking sketch after this listing
from burr.core import ApplicationBuilder, State, default
from burr.core.action import action

from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document

# Support both running this file directly and importing it from the package.
if __name__ == '__main__':
    from scrapegraphai.utils.remover import remover
else:
    from ..utils.remover import remover


@action(reads=["url", "local_dir"], writes=["doc"])
def fetch_node(state: State, headless: bool = True, verbose: bool = False,
               input_type: str = "url", node_config: dict = None) -> tuple[dict, State]:
    # `input_type` and `node_config` stand in for the `self.input` and
    # `self.node_config` attributes of the class-based FetchNode, which do
    # not exist on a standalone Burr action.
    if verbose:
        print("--- Executing Fetch Node ---")

    source = state.get("url", state.get("local_dir"))

    # Structured local files are passed through untouched.
    if input_type in ("json_dir", "xml_dir", "csv_dir"):
        compressed_document = [Document(page_content=source, metadata={
            "source": "local_dir"
        })]
    # If it is a local directory, clean the raw HTML.
    elif not source.startswith("http"):
        compressed_document = [Document(page_content=remover(source), metadata={
            "source": "local_dir"
        })]
    else:
        if node_config is not None and node_config.get("endpoint") is not None:
            loader = AsyncChromiumLoader(
                [source],
                proxies={"http": node_config["endpoint"]},
                headless=headless,
            )
        else:
            loader = AsyncChromiumLoader(
                [source],
                headless=headless,
            )
        document = loader.load()
        compressed_document = [
            Document(page_content=remover(str(document[0].page_content)))]

    return {"doc": compressed_document}, state.update(doc=compressed_document)


@action(reads=["doc"], writes=["parsed_doc"])
def parse_node(state: State, chunk_size: int) -> tuple[dict, State]:
    # Placeholder: chunking not yet ported.
    return {}, state


@action(reads=["user_prompt", "parsed_doc", "doc"],
        writes=["relevant_chunks"])
def rag_node(state: State, llm_model: object, embedder_model: object) -> tuple[dict, State]:
    # Placeholder: retrieval not yet ported.
    return {}, state


@action(reads=["user_prompt", "relevant_chunks", "parsed_doc", "doc"],
        writes=["answer"])
def generate_answer_node(state: State, llm_model: object) -> tuple[dict, State]:
    # Placeholder: answer generation not yet ported.
    return {}, state


def run(prompt: str, input_key: str, source: str, config: dict) -> str:
    llm_model = config["llm_model"]
    embedder_model = config["embedder_model"]
    chunk_size = config["model_token"]

    initial_state = {
        "user_prompt": prompt,
        input_key: source
    }
    app = (
        ApplicationBuilder()
        .with_actions(
            fetch_node=fetch_node,
            parse_node=parse_node,
            rag_node=rag_node,
            generate_answer_node=generate_answer_node
        )
        .with_transitions(
            ("fetch_node", "parse_node", default),
            ("parse_node", "rag_node", default),
            ("rag_node", "generate_answer_node", default)
        )
        .with_entrypoint("fetch_node")
        .with_state(**initial_state)
        .build()
    )
    app.visualize(
        output_file_path="smart_scraper_graph",
        include_conditions=False, view=True, format="png"
    )
    # last_action, result, state = app.run(
    #     halt_after=["generate_answer_node"],
    #     inputs={
    #         "llm_model": llm_model,
    #         "embedder_model": embedder_model,
    #         "chunk_size": chunk_size
    #     }
    # )
    # return result.get("answer", "No answer found.")


if __name__ == '__main__':
    prompt = "What is the capital of France?"
    source = "https://en.wikipedia.org/wiki/Paris"
    input_key = "url"
    config = {
        "llm_model": "rag-token",
        "embedder_model": "foo",
        "model_token": "bar",
    }
    run(prompt, input_key, source, config)
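
The file imports burr.tracking but never uses it. A hedged sketch of how the builder in run() could attach Burr's local tracker, so that runs show up in the UI shipped with burr[start]; the project name is invented for illustration:

from burr import tracking

# LocalTrackingClient persists each step locally so the `burr` UI can replay the run.
tracker = tracking.LocalTrackingClient(project="smart_scraper_graph")  # project name is an assumption

app = (
    ApplicationBuilder()
    .with_actions(
        fetch_node=fetch_node,
        parse_node=parse_node,
        rag_node=rag_node,
        generate_answer_node=generate_answer_node
    )
    .with_transitions(
        ("fetch_node", "parse_node", default),
        ("parse_node", "rag_node", default),
        ("rag_node", "generate_answer_node", default)
    )
    .with_entrypoint("fetch_node")
    .with_state(user_prompt="What is the capital of France?",
                url="https://en.wikipedia.org/wiki/Paris")
    .with_tracker(tracker)  # every step is recorded for the burr UI
    .build()
)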
Lines changed: 70 additions & 0 deletions
"""
SmartScraperGraph Module Hamilton Version
"""

from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document

# Support both running this file directly and importing it from the package.
if __name__ == '__main__':
    from scrapegraphai.utils.remover import remover
else:
    from ..utils.remover import remover


def fetch_node(source: str,
               headless: bool = True
               ) -> Document:
    # Local content is cleaned directly; URLs are fetched with Chromium first.
    if not source.startswith("http"):
        return Document(page_content=remover(source), metadata={
            "source": "local_dir"
        })
    else:
        loader = AsyncChromiumLoader(
            [source],
            headless=headless,
        )
        document = loader.load()
        return Document(page_content=remover(str(document[0].page_content)))


def parse_node(fetch_node: Document, chunk_size: int) -> list[Document]:
    # Placeholder: chunking not yet implemented.
    pass


def rag_node(parse_node: list[Document]) -> list[Document]:
    # Placeholder: retrieval not yet implemented.
    pass


def generate_answer_node(rag_node: list[Document]) -> str:
    # Placeholder: answer generation not yet implemented.
    pass


if __name__ == '__main__':
    from hamilton import driver
    import __main__ as smart_scraper_graph_hamilton

    dr = (
        driver.Builder()
        .with_modules(smart_scraper_graph_hamilton)
        .with_config({})
        .build()
    )
    dr.display_all_functions("smart_scraper.png")

    # config = {
    #     "llm_model": "rag-token",
    #     "embedder_model": "foo",
    #     "model_token": "bar",
    # }
    #
    # result = dr.execute(
    #     ["generate_answer_node"],
    #     inputs={
    #         "prompt": "What is the capital of France?",
    #         "source": "https://en.wikipedia.org/wiki/Paris",
    #     }
    # )
    #
    # print(result)
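
The three downstream nodes are stubs. Purely for illustration, a sketch of one way parse_node could be filled in, using langchain's RecursiveCharacterTextSplitter; this is an assumption, not necessarily how the class-based ParseNode in scrapegraphai chunks:

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter  # assumed available alongside the pinned langchain packages

def parse_node(fetch_node: Document, chunk_size: int) -> list[Document]:
    # Split the fetched page into chunk_size-bounded pieces so a downstream
    # RAG step can embed and rank them.
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    return [Document(page_content=chunk)
            for chunk in splitter.split_text(fetch_node.page_content)]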
