
Commit 203de83

fix(pdf): correctly read .pdf files
1 parent 91c5b5a commit 203de83


4 files changed: +13 -3 lines changed


examples/openai/pdf_scraper_graph_openai.py renamed to examples/openai/pdf_scraper_openai.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@
 
 pdf_scraper_graph = PDFScraperGraph(
     prompt="Summarize the text and find the main topics",
-    source="a.pdf",
+    source="Laureaconanniaccademici.pdf",
     config=graph_config,
 )
 result = pdf_scraper_graph.run()
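
For context, a minimal sketch of how the renamed example could be run end to end. The graph_config shape, model name, and environment variable are assumptions modeled on the other examples/openai scripts, not part of this diff:

import os
from scrapegraphai.graphs import PDFScraperGraph

# Assumed OpenAI-style configuration; the model name is illustrative.
graph_config = {
    "llm": {
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-3.5-turbo",
    },
}

pdf_scraper_graph = PDFScraperGraph(
    prompt="Summarize the text and find the main topics",
    source="Laureaconanniaccademici.pdf",  # local PDF path, as in the diff
    config=graph_config,
)
result = pdf_scraper_graph.run()
print(result)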

scrapegraphai/nodes/fetch_node.py

Lines changed: 3 additions & 1 deletion
@@ -95,8 +95,10 @@ def execute(self, state):
 
             state.update({self.output[0]: compressed_document})
             return state
-        # handling for pdf
+        # handling pdf
         elif input_keys[0] == "pdf":
+
+            # TODO: fix bytes content issue
             loader = PyPDFLoader(source)
             compressed_document = loader.load()
             state.update({self.output[0]: compressed_document})
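
To illustrate the new pdf branch in isolation, a small sketch of what PyPDFLoader produces: load() returns a list of langchain Document objects (one per page), which is what ends up in the state as compressed_document. The filename is illustrative, and pypdf must be installed for the loader to work:

from langchain_community.document_loaders import PyPDFLoader

# Load a local PDF; each page becomes a separate Document with page metadata.
loader = PyPDFLoader("Laureaconanniaccademici.pdf")
compressed_document = loader.load()

for doc in compressed_document:
    print(doc.metadata.get("page"), len(doc.page_content))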

scrapegraphai/nodes/parse_node.py

Lines changed: 9 additions & 1 deletion
@@ -5,6 +5,7 @@
 from typing import List, Optional
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
+from langchain_core.documents import Document
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 
@@ -79,10 +80,17 @@ def execute(self, state: dict) -> dict:
         else:
             docs_transformed = docs_transformed[0]
 
-        chunks = chunk(text=docs_transformed,
+        if type(docs_transformed) == Document:
+            chunks = chunk(text=docs_transformed.page_content,
                        chunk_size= self.node_config.get("chunk_size", 4096),
                        token_counter=lambda x: len(x.split()),
                        memoize=False)
+        else:
+
+            chunks = chunk(text=docs_transformed,
+                        chunk_size= self.node_config.get("chunk_size", 4096),
+                        token_counter=lambda x: len(x.split()),
+                        memoize=False)
 
         state.update({self.output[0]: chunks})
 
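
A standalone sketch of the branching logic this hunk adds: when the transformed docs arrive as a single langchain Document (the PDF path above), chunk its page_content; otherwise chunk the raw string as before. The helper name, sample text, and chunk_size below are illustrative, and isinstance is used for the same type check:

from semchunk import chunk
from langchain_core.documents import Document

def to_chunks(docs_transformed, chunk_size=4096):
    # Documents carry their text in page_content; plain strings are chunked directly.
    if isinstance(docs_transformed, Document):
        text = docs_transformed.page_content
    else:
        text = docs_transformed
    return chunk(
        text=text,
        chunk_size=chunk_size,
        token_counter=lambda x: len(x.split()),
        memoize=False,
    )

print(to_chunks(Document(page_content="some extracted pdf text"), chunk_size=8))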
