
Commit 203de83

fix(pdf): correctly read .pdf files
1 parent 91c5b5a commit 203de83


4 files changed: +13 -3 lines changed


examples/openai/pdf_scraper_graph_openai.py renamed to examples/openai/pdf_scraper_openai.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@
 
 pdf_scraper_graph = PDFScraperGraph(
     prompt="Summarize the text and find the main topics",
-    source="a.pdf",
+    source="Laureaconanniaccademici.pdf",
     config=graph_config,
 )
 result = pdf_scraper_graph.run()
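
For context, a minimal sketch of how the renamed example could be run end to end. The graph_config shape, model name, and environment variable are assumptions modeled on the other examples/openai scripts, not part of this diff:

import os
from scrapegraphai.graphs import PDFScraperGraph

# Assumed OpenAI-style configuration; the model name is illustrative.
graph_config = {
    "llm": {
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-3.5-turbo",
    },
}

pdf_scraper_graph = PDFScraperGraph(
    prompt="Summarize the text and find the main topics",
    source="Laureaconanniaccademici.pdf",  # local PDF path, as in the diff
    config=graph_config,
)
result = pdf_scraper_graph.run()
print(result)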

scrapegraphai/nodes/fetch_node.py

Lines changed: 3 additions & 1 deletion
@@ -95,8 +95,10 @@ def execute(self, state):
 
             state.update({self.output[0]: compressed_document})
             return state
-        # handling for pdf
+        # handling pdf
         elif input_keys[0] == "pdf":
+
+            # TODO: fix bytes content issue
             loader = PyPDFLoader(source)
             compressed_document = loader.load()
             state.update({self.output[0]: compressed_document})
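
To illustrate the new pdf branch in isolation, a small sketch of what PyPDFLoader produces: load() returns a list of langchain Document objects (one per page), which is what ends up in the state as compressed_document. The filename is illustrative, and pypdf must be installed for the loader to work:

from langchain_community.document_loaders import PyPDFLoader

# Load a local PDF; each page becomes a separate Document with page metadata.
loader = PyPDFLoader("Laureaconanniaccademici.pdf")
compressed_document = loader.load()

for doc in compressed_document:
    print(doc.metadata.get("page"), len(doc.page_content))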

scrapegraphai/nodes/parse_node.py

Lines changed: 9 additions & 1 deletion
@@ -5,6 +5,7 @@
 from typing import List, Optional
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
+from langchain_core.documents import Document
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 
@@ -79,10 +80,17 @@ def execute(self, state: dict) -> dict:
         else:
             docs_transformed = docs_transformed[0]
 
-        chunks = chunk(text=docs_transformed,
+        if type(docs_transformed) == Document:
+            chunks = chunk(text=docs_transformed.page_content,
                        chunk_size= self.node_config.get("chunk_size", 4096),
                        token_counter=lambda x: len(x.split()),
                        memoize=False)
+        else:
+
+            chunks = chunk(text=docs_transformed,
+                        chunk_size= self.node_config.get("chunk_size", 4096),
+                        token_counter=lambda x: len(x.split()),
+                        memoize=False)
 
         state.update({self.output[0]: chunks})
 
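
A standalone sketch of the branching logic this hunk adds: when the transformed docs arrive as a single langchain Document (the PDF path above), chunk its page_content; otherwise chunk the raw string as before. The helper name, sample text, and chunk_size below are illustrative, and isinstance is used for the same type check:

from semchunk import chunk
from langchain_core.documents import Document

def to_chunks(docs_transformed, chunk_size=4096):
    # Documents carry their text in page_content; plain strings are chunked directly.
    if isinstance(docs_transformed, Document):
        text = docs_transformed.page_content
    else:
        text = docs_transformed
    return chunk(
        text=text,
        chunk_size=chunk_size,
        token_counter=lambda x: len(x.split()),
        memoize=False,
    )

print(to_chunks(Document(page_content="some extracted pdf text"), chunk_size=8))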
