Skip to content

Commit 3c7dedf

Browse files
authored
Merge pull request #305 from VinciGit00/pdf_fix
Pdf fix
2 parents fb74a52 + a796169 commit 3c7dedf

File tree

9 files changed

+139
-66
lines changed

9 files changed

+139
-66
lines changed
Lines changed: 21 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,18 @@
1-
"""
2-
Basic example of scraping pipeline using PDFScraper
31
"""
4-
5-
import os
6-
from dotenv import load_dotenv
2+
Module for showing how PDFScraper works
3+
"""
74
from scrapegraphai.graphs import PDFScraperGraph
85

9-
load_dotenv()
10-
11-
12-
# ************************************************
13-
# Define the configuration for the graph
14-
# ************************************************
15-
16-
openai_key = os.getenv("OPENAI_APIKEY")
17-
186
graph_config = {
197
"llm": {
20-
"api_key":openai_key,
21-
"model": "gpt-3.5-turbo",
8+
"model": "ollama/llama3",
9+
"temperature": 0,
10+
"format": "json", # Ollama needs the format to be specified explicitly
11+
"model_tokens": 4000,
12+
},
13+
"embeddings": {
14+
"model": "ollama/nomic-embed-text",
15+
"temperature": 0,
2216
},
2317
"verbose": True,
2418
"headless": False,
@@ -27,8 +21,6 @@
2721
# Covert to list
2822
sources = [
2923
"This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.",
30-
"The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons.",
31-
"Hollywood films are generally released first in the United States and then later abroad, with some variation in lags across films and countries. With the growth in movie piracy since the appearance of BitTorrent in 2003, films have become available through illegal piracy immediately after release in the US, while they are not available for legal viewing abroad until their foreign premieres in each country. We make use of this variation in international release lags to ask whether longer lags – which facilitate more local pre-release piracy – depress theatrical box office receipts, particularly after the widespread adoption of BitTorrent. We find that longer release windows are associated with decreased box office returns, even after controlling for film and country fixed effects. This relationship is much stronger in contexts where piracy is more prevalent: after BitTorrent’s adoption and in heavily-pirated genres. Our findings indicate that, as a lower bound, international box office returns in our sample were at least 7% lower than they would have been in the absence of pre-release piracy. By contrast, we do not see evidence of elevated sales displacement in US box office revenue following the adoption of BitTorrent, and we suggest that delayed legal availability of the content abroad may drive the losses to piracy."
3224
# Add more sources here
3325
]
3426

@@ -62,13 +54,14 @@
6254
Dependent Variable (DV): Mental health outcomes.
6355
Exogenous Shock: staggered introduction of Facebook across U.S. colleges.
6456
"""
65-
66-
pdf_scraper_graph = PDFScraperGraph(
67-
prompt=prompt,
68-
source=sources[0],
69-
config=graph_config
70-
)
71-
result = pdf_scraper_graph.run()
72-
73-
74-
print(result)
57+
results = []
58+
for source in sources:
59+
pdf_scraper_graph = PDFScraperGraph(
60+
prompt=prompt,
61+
source=source,
62+
config=graph_config
63+
)
64+
result = pdf_scraper_graph.run()
65+
results.append(result)
66+
67+
print(results)
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import os, json
2+
from dotenv import load_dotenv
3+
from scrapegraphai.graphs import PDFScraperGraph
4+
5+
load_dotenv()
6+
7+
8+
# ************************************************
9+
# Define the configuration for the graph
10+
# ************************************************
11+
12+
openai_key = os.getenv("OPENAI_APIKEY")
13+
14+
graph_config = {
15+
"llm": {
16+
"api_key": openai_key,
17+
"model": "gpt-3.5-turbo",
18+
},
19+
"verbose": True,
20+
"headless": False,
21+
}
22+
23+
source = """
24+
The Divine Comedy, Italian La Divina Commedia, original name La commedia, long narrative poem written in Italian
25+
circa 1308/21 by Dante. It is usually held to be one of the world s great works of literature.
26+
Divided into three major sections—Inferno, Purgatorio, and Paradiso—the narrative traces the journey of Dante
27+
from darkness and error to the revelation of the divine light, culminating in the Beatific Vision of God.
28+
Dante is guided by the Roman poet Virgil, who represents the epitome of human knowledge, from the dark wood
29+
through the descending circles of the pit of Hell (Inferno). He then climbs the mountain of Purgatory, guided
30+
by the Roman poet Statius, who represents the fulfilment of human knowledge, and is finally led by his lifelong love,
31+
the Beatrice of his earlier poetry, through the celestial spheres of Paradise.
32+
"""
33+
34+
schema = """
35+
{
36+
"type": "object",
37+
"properties": {
38+
"summary": {
39+
"type": "string"
40+
},
41+
"topics": {
42+
"type": "array",
43+
"items": {
44+
"type": "string"
45+
}
46+
}
47+
}
48+
}
49+
"""
50+
51+
pdf_scraper_graph = PDFScraperGraph(
52+
prompt="Summarize the text and find the main topics",
53+
source=source,
54+
config=graph_config,
55+
schema=schema,
56+
)
57+
result = pdf_scraper_graph.run()
58+
59+
print(json.dumps(result, indent=4))

requirements-dev.lock

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ botocore==1.34.113
4848
# via boto3
4949
# via s3transfer
5050
burr==0.19.1
51-
# via burr
5251
# via scrapegraphai
5352
cachetools==5.3.3
5453
# via google-auth
@@ -64,13 +63,6 @@ click==8.1.7
6463
# via streamlit
6564
# via typer
6665
# via uvicorn
67-
colorama==0.4.6
68-
# via click
69-
# via loguru
70-
# via pytest
71-
# via sphinx
72-
# via tqdm
73-
# via uvicorn
7466
contourpy==1.2.1
7567
# via matplotlib
7668
cycler==0.12.1
@@ -144,7 +136,6 @@ graphviz==0.20.3
144136
# via scrapegraphai
145137
greenlet==3.0.3
146138
# via playwright
147-
# via sqlalchemy
148139
groq==0.8.0
149140
# via langchain-groq
150141
grpcio==1.64.0
@@ -475,19 +466,17 @@ undetected-playwright==0.3.0
475466
# via scrapegraphai
476467
uritemplate==4.1.1
477468
# via google-api-python-client
478-
urllib3==2.2.1
469+
urllib3==1.26.18
479470
# via botocore
480471
# via requests
481472
uvicorn==0.29.0
482473
# via burr
483474
# via fastapi
484-
watchdog==4.0.1
485-
# via streamlit
475+
uvloop==0.19.0
476+
# via uvicorn
486477
watchfiles==0.21.0
487478
# via uvicorn
488479
websockets==12.0
489480
# via uvicorn
490-
win32-setctime==1.1.0
491-
# via loguru
492481
yarl==1.9.4
493482
# via aiohttp

requirements.lock

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@ certifi==2024.2.2
4040
# via requests
4141
charset-normalizer==3.3.2
4242
# via requests
43-
colorama==0.4.6
44-
# via tqdm
4543
dataclasses-json==0.6.6
4644
# via langchain
4745
# via langchain-community
@@ -89,7 +87,6 @@ graphviz==0.20.3
8987
# via scrapegraphai
9088
greenlet==3.0.3
9189
# via playwright
92-
# via sqlalchemy
9390
groq==0.8.0
9491
# via langchain-groq
9592
grpcio==1.64.0
@@ -287,7 +284,7 @@ undetected-playwright==0.3.0
287284
# via scrapegraphai
288285
uritemplate==4.1.1
289286
# via google-api-python-client
290-
urllib3==2.2.1
287+
urllib3==1.26.18
291288
# via botocore
292289
# via requests
293290
yarl==1.9.4

scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ class PDFScraperGraph(AbstractGraph):
4747
"""
4848

4949
def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
50-
super().__init__(prompt, config, source)
50+
super().__init__(prompt, config, source, schema)
5151

5252
self.input_key = "pdf" if source.endswith("pdf") else "pdf_dir"
5353

@@ -76,6 +76,7 @@ def _create_graph(self) -> BaseGraph:
7676
output=["answer"],
7777
node_config={
7878
"llm_model": self.llm_model,
79+
"schema": self.schema
7980
}
8081
)
8182

scrapegraphai/helpers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@
88
from .robots import robots_dictionary
99
from .generate_answer_node_prompts import template_chunks, template_chunks_with_schema, template_no_chunks, template_no_chunks_with_schema, template_merge
1010
from .generate_answer_node_csv_prompts import template_chunks_csv, template_no_chunks_csv, template_merge_csv
11-
from .generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf
11+
from .generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf, template_chunks_pdf_with_schema, template_no_chunks_pdf_with_schema
1212
from .generate_answer_node_omni_prompts import template_chunks_omni, template_no_chunk_omni, template_merge_omni

scrapegraphai/helpers/generate_answer_node_pdf_prompts.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,19 @@
1313
Content of {chunk_id}: {context}. \n
1414
"""
1515

16+
template_chunks_pdf_with_schema = """
17+
You are a PDF scraper and you have just scraped the
18+
following content from a PDF.
19+
You are now asked to answer a user question about the content you have scraped.\n
20+
The PDF is big so I am giving you one chunk at the time to be merged later with the other chunks.\n
21+
Ignore all the context sentences that ask you not to extract information from the html code.\n
22+
If you don't find the answer put as value "NA".\n
23+
Make sure the output json is formatted correctly and does not contain errors. \n
24+
The schema as output is the following: {schema}\n
25+
Output instructions: {format_instructions}\n
26+
Content of {chunk_id}: {context}. \n
27+
"""
28+
1629
template_no_chunks_pdf = """
1730
You are a PDF scraper and you have just scraped the
1831
following content from a PDF.
@@ -25,6 +38,19 @@
2538
PDF content: {context}\n
2639
"""
2740

41+
template_no_chunks_pdf_with_schema = """
42+
You are a PDF scraper and you have just scraped the
43+
following content from a PDF.
44+
You are now asked to answer a user question about the content you have scraped.\n
45+
Ignore all the context sentences that ask you not to extract information from the html code.\n
46+
If you don't find the answer put as value "NA".\n
47+
Make sure the output json is formatted correctly and does not contain errors. \n
48+
The schema as output is the following: {schema}\n
49+
Output instructions: {format_instructions}\n
50+
User question: {question}\n
51+
PDF content: {context}\n
52+
"""
53+
2854
template_merge_pdf = """
2955
You are a PDF scraper and you have just scraped the
3056
following content from a PDF.

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -82,28 +82,36 @@ def execute(self, state: dict) -> dict:
8282
chains_dict = {}
8383

8484
# Use tqdm to add progress bar
85-
for i, chunk in enumerate(
86-
tqdm(doc, desc="Processing chunks", disable=not self.verbose)
87-
):
88-
if len(doc) == 1:
85+
for i, chunk in enumerate(tqdm(doc, desc="Processing chunks", disable=not self.verbose)):
86+
if self.node_config["schema"] is None and len(doc) == 1:
8987
prompt = PromptTemplate(
9088
template=template_no_chunks,
9189
input_variables=["question"],
92-
partial_variables={
93-
"context": chunk.page_content,
94-
"format_instructions": format_instructions,
95-
},
96-
)
97-
else:
90+
partial_variables={"context": chunk.page_content,
91+
"format_instructions": format_instructions})
92+
elif self.node_config["schema"] is not None and len(doc) == 1:
93+
prompt = PromptTemplate(
94+
template=template_no_chunks_with_schema,
95+
input_variables=["question"],
96+
partial_variables={"context": chunk.page_content,
97+
"format_instructions": format_instructions,
98+
"schema": self.node_config["schema"]
99+
})
100+
elif self.node_config["schema"] is None and len(doc) > 1:
98101
prompt = PromptTemplate(
99102
template=template_chunks,
100103
input_variables=["question"],
101-
partial_variables={
102-
"context": chunk.page_content,
103-
"chunk_id": i + 1,
104-
"format_instructions": format_instructions,
105-
},
106-
)
104+
partial_variables={"context": chunk.page_content,
105+
"chunk_id": i + 1,
106+
"format_instructions": format_instructions})
107+
elif self.node_config["schema"] is not None and len(doc) > 1:
108+
prompt = PromptTemplate(
109+
template=template_chunks_with_schema,
110+
input_variables=["question"],
111+
partial_variables={"context": chunk.page_content,
112+
"chunk_id": i + 1,
113+
"format_instructions": format_instructions,
114+
"schema": self.node_config["schema"]})
107115

108116
# Dynamically name the chains based on their index
109117
chain_name = f"chunk{i+1}"

scrapegraphai/nodes/generate_answer_pdf_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
# Imports from the library
1717
from .base_node import BaseNode
18-
from ..helpers.generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf
18+
from ..helpers.generate_answer_node_pdf_prompts import template_chunks_pdf, template_no_chunks_pdf, template_merge_pdf, template_chunks_pdf_with_schema, template_no_chunks_pdf_with_schema
1919

2020

2121
class GenerateAnswerPDFNode(BaseNode):
@@ -57,7 +57,7 @@ def __init__(
5757
node_name (str): name of the node
5858
"""
5959
super().__init__(node_name, "node", input, output, 2, node_config)
60-
self.llm_model = node_config["llm"]
60+
self.llm_model = node_config["llm_model"]
6161
self.verbose = (
6262
False if node_config is None else node_config.get("verbose", False)
6363
)

0 commit comments

Comments
 (0)