Skip to content

Commit a1d580c

Browse files
authored
Merge pull request #195 from shorthills-ai/pre/beta
2 parents 590aab7 + 905b345 commit a1d580c

File tree

3 files changed

+14
-11
lines changed

3 files changed

+14
-11
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,3 +18,4 @@ playwright==1.43.0
1818
langchain-aws==0.1.2
1919
langchain-anthropic==0.1.11
2020
yahoo-search-py==0.3
21+
pypdf==4.2.0

scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 3 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -56,36 +56,29 @@ def _create_graph(self) -> BaseGraph:
5656
"""
5757

5858
fetch_node = FetchNode(
59-
input="pdf_dir",
59+
input='pdf',
6060
output=["doc"],
61-
node_config={
62-
"headless": self.headless,
63-
"verbose": self.verbose
64-
}
6561
)
6662
parse_node = ParseNode(
6763
input="doc",
6864
output=["parsed_doc"],
6965
node_config={
7066
"chunk_size": self.model_token,
71-
"verbose": self.verbose
7267
}
7368
)
7469
rag_node = RAGNode(
7570
input="user_prompt & (parsed_doc | doc)",
7671
output=["relevant_chunks"],
7772
node_config={
78-
"llm": self.llm_model,
73+
"llm_model": self.llm_model,
7974
"embedder_model": self.embedder_model,
80-
"verbose": self.verbose
8175
}
8276
)
8377
generate_answer_node = GenerateAnswerNode(
8478
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
8579
output=["answer"],
8680
node_config={
87-
"llm": self.llm_model,
88-
"verbose": self.verbose
81+
"llm_model": self.llm_model,
8982
}
9083
)
9184

scrapegraphai/nodes/fetch_node.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
from typing import List, Optional
66
from langchain_community.document_loaders import AsyncChromiumLoader
77
from langchain_core.documents import Document
8+
from langchain_community.document_loaders import PyPDFLoader
89
from .base_node import BaseNode
910
from ..utils.remover import remover
1011

@@ -56,7 +57,6 @@ def execute(self, state):
5657

5758
# Interpret input keys based on the provided input expression
5859
input_keys = self.get_input_keys(state)
59-
6060
# Fetching data from the state based on the input keys
6161
input_data = [state[key] for key in input_keys]
6262

@@ -66,6 +66,15 @@ def execute(self, state):
6666
"source": "local_dir"
6767
})]
6868
# if it is a local directory
69+
70+
# handling for pdf
71+
elif self.input == "pdf":
72+
loader = PyPDFLoader(source)
73+
compressed_document = loader.load()
74+
75+
elif self.input == "pdf_dir":
76+
pass
77+
6978
elif not source.startswith("http"):
7079
compressed_document = [Document(page_content=remover(source), metadata={
7180
"source": "local_dir"

0 commit comments

Comments
 (0)