Skip to content

Commit 91c5b5a

Browse files
committed
fix(multi): updated multi pdf scraper with schema
1 parent 1705046 commit 91c5b5a

File tree

5 files changed

+36
-37
lines changed

5 files changed

+36
-37
lines changed

examples/openai/pdf_scraper_graph_openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232

3333
pdf_scraper_graph = PDFScraperGraph(
3434
prompt="Summarize the text and find the main topics",
35-
source=source,
35+
source="a.pdf",
3636
config=graph_config,
3737
)
3838
result = pdf_scraper_graph.run()

examples/openai/pdf_scraper_multi_openai.py

Lines changed: 28 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,63 +6,58 @@
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import PdfScraperMultiGraph
88

9+
from pydantic import BaseModel, Field
10+
from typing import List
11+
912
load_dotenv()
1013

1114
openai_key = os.getenv("OPENAI_APIKEY")
1215

16+
# ************************************************
17+
# Define the configuration for the graph
18+
# ************************************************
19+
1320
graph_config = {
1421
"llm": {
1522
"api_key": openai_key,
1623
"model": "gpt-3.5-turbo",
1724
},
25+
"verbose": True,
1826
}
1927

20-
# Covert to list
21-
sources = [
22-
"This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.",
23-
"This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.",
24-
"This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.",
25-
"This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.",
26-
]
27-
28-
prompt = """
29-
You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements:
30-
31-
Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables.
32-
Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable.
33-
Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV.
34-
Response Format: For each abstract, present your response in the following structured format:
28+
# ************************************************
29+
# Define the output schema for the graph
30+
# ************************************************
3531

36-
Independent Variable (IV):
37-
Dependent Variable (DV):
38-
Exogenous Shock:
32+
class Article(BaseModel):
33+
independent_variable: str = Field(description="(IV): The variable that is manipulated or considered as the primary cause affecting other variables.")
34+
dependent_variable: str = Field(description="(DV) The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable.")
35+
exogenous_shock: str = Field(description="Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV.")
3936

40-
Example Queries and Responses:
37+
class Articles(BaseModel):
38+
articles: List[Article]
4139

42-
Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.
40+
# ************************************************
41+
# Define the sources for the graph
42+
# ************************************************
4343

44-
Response:
45-
46-
Independent Variable (IV): Employee happiness.
47-
Dependent Variable (DV): Overall firm productivity.
48-
Exogenous Shock: Sudden company-wide increase in bonus payments.
49-
50-
Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons.
51-
52-
Response:
44+
sources = [
45+
"This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.",
46+
"The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons."
47+
]
5348

54-
Independent Variable (IV): Exposure to social media.
55-
Dependent Variable (DV): Mental health outcomes.
56-
Exogenous Shock: staggered introduction of Facebook across U.S. colleges.
49+
prompt = """
50+
Analyze the abstracts provided from an academic journal article to extract and clearly identify the Independent Variable (IV), Dependent Variable (DV), and Exogenous Shock.
5751
"""
52+
5853
# *******************************************************
5954
# Create the PdfScraperMultiGraph instance and run it
6055
# *******************************************************
6156

6257
multiple_search_graph = PdfScraperMultiGraph(
6358
prompt=prompt,
6459
source= sources,
65-
schema=None,
60+
schema=Articles,
6661
config=graph_config
6762
)
6863

scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,4 @@ def run(self) -> str:
119119
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
120120
self.final_state, self.execution_info = self.graph.execute(inputs)
121121

122-
return self.final_state.get("answer", "No answer found.")[0]
122+
return self.final_state.get("answer", "No answer found.")

scrapegraphai/graphs/pdf_scraper_multi.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from copy import copy, deepcopy
66
from typing import List, Optional
7+
from pydantic import BaseModel
78

89
from .base_graph import BaseGraph
910
from .abstract_graph import AbstractGraph
@@ -43,7 +44,7 @@ class PdfScraperMultiGraph(AbstractGraph):
4344
>>> result = search_graph.run()
4445
"""
4546

46-
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
47+
def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[BaseModel] = None):
4748

4849
self.max_results = config.get("max_results", 3)
4950

@@ -52,6 +53,8 @@ def __init__(self, prompt: str, source: List[str], config: dict, schema: Optiona
5253
else:
5354
self.copy_config = deepcopy(config)
5455

56+
self.copy_schema = deepcopy(schema)
57+
5558
super().__init__(prompt, config, source, schema)
5659

5760
def _create_graph(self) -> BaseGraph:
@@ -70,6 +73,7 @@ def _create_graph(self) -> BaseGraph:
7073
prompt="",
7174
source="",
7275
config=self.copy_config,
76+
schema=self.copy_schema
7377
)
7478

7579
# ************************************************

scrapegraphai/nodes/generate_answer_pdf_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,5 +150,5 @@ def execute(self, state):
150150
answer = merge_chain.invoke({"context": answer, "question": user_prompt})
151151

152152
# Update the state with the generated answer
153-
state.update({self.output[0]: answer.get("Response", {})})
153+
state.update({self.output[0]: answer})
154154
return state

0 commit comments

Comments
 (0)