Skip to content

Refactoring pdf scraper and json scrape #323

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions examples/anthropic/csv_scraper_graph_multi_haiku.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents
"""

import os
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperMultiGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# Load variables from a .env file (if present) before os.getenv is consulted.
load_dotenv()

# ************************************************
# Read the CSV file
# ************************************************

FILE_NAME = "inputs/username.csv"
# Resolve the input relative to this script so the example runs from any cwd.
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

text = pd.read_csv(file_path)

# str(DataFrame) is the *display* repr: pandas replaces rows, columns and long
# cell values with "..." once its display limits are exceeded, so the model
# would be given an incomplete table. to_string() renders the whole frame.
csv_text = text.to_string()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("ANTHROPIC_API_KEY"),
        "model": "claude-3-haiku-20240307",
        "max_tokens": 4000,
    },
}

# ************************************************
# Create the CSVScraperMultiGraph instance and run it
# ************************************************

# The same table text is passed twice to exercise the multi-source graph.
csv_scraper_graph = CSVScraperMultiGraph(
    prompt="List me all the last names",
    source=[csv_text, csv_text],
    config=graph_config,
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
36 changes: 36 additions & 0 deletions examples/anthropic/json_scraper_multi_haiku.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Module for showing how JSONScraperMultiGraph multi works
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import JSONScraperMultiGraph

# Load variables from a .env file (if present) before os.getenv is consulted.
load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

llm_settings = {
    "api_key": os.getenv("ANTHROPIC_API_KEY"),
    "model": "claude-3-haiku-20240307",
    "max_tokens": 4000,
}
graph_config = {"llm": llm_settings}

# ************************************************
# Read the JSON file
# ************************************************

FILE_NAME = "inputs/example.json"
# Resolve the input relative to this script so the example runs from any cwd.
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, encoding="utf-8") as json_file:
    text = json_file.read()

# The same document is submitted twice to exercise the multi-source graph.
sources = [text] * 2

# ************************************************
# Create the JSONScraperMultiGraph instance and run it
# ************************************************

multiple_search_graph = JSONScraperMultiGraph(
    prompt="List me all the authors, title and genres of the books",
    source=sources,
    schema=None,
    config=graph_config,
)

result = multiple_search_graph.run()
print(json.dumps(result, indent=4))
4 changes: 3 additions & 1 deletion examples/anthropic/pdf_scraper_graph_haiku.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""
Module for showing how PDFScraperGraph works
"""
import os, json
from dotenv import load_dotenv
from scrapegraphai.graphs import PDFScraperGraph

load_dotenv()


# ************************************************
# Define the configuration for the graph
# ************************************************
Expand Down
72 changes: 72 additions & 0 deletions examples/anthropic/pdf_scraper_multi_haiku.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Module for showing how PDFScraper multi works
"""
import os
import json
from dotenv import load_dotenv
from scrapegraphai.graphs import PdfScraperMultiGraph

# Load variables from a .env file (if present) before os.getenv is consulted.
load_dotenv()

graph_config = {
    "llm": {
        "api_key": os.getenv("ANTHROPIC_API_KEY"),
        "model": "claude-3-haiku-20240307",
        "max_tokens": 4000,
    },
}

# ***************
# Convert to list
# ***************

# A single abstract, submitted four times to exercise the multi-source graph.
_ABSTRACT = "This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather—the interaction between call center architecture and outdoor weather conditions—in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity – largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking."

sources = [_ABSTRACT] * 4

# Few-shot instructions: defines the fields to extract and gives two worked
# query/response examples.
prompt = """
You are an expert in reviewing academic manuscripts. Please analyze the abstracts provided from an academic journal article to extract and clearly identify the following elements:

Independent Variable (IV): The variable that is manipulated or considered as the primary cause affecting other variables.
Dependent Variable (DV): The variable that is measured or observed, which is expected to change as a result of variations in the Independent Variable.
Exogenous Shock: Identify any external or unexpected events used in the study that serve as a natural experiment or provide a unique setting for observing the effects on the IV and DV.
Response Format: For each abstract, present your response in the following structured format:

Independent Variable (IV):
Dependent Variable (DV):
Exogenous Shock:

Example Queries and Responses:

Query: This paper provides evidence from a natural experiment on the relationship between positive affect and productivity. We link highly detailed administrative data on the behaviors and performance of all telesales workers at a large telecommunications company with survey reports of employee happiness that we collected on a weekly basis. We use variation in worker mood arising from visual exposure to weather the interaction between call center architecture and outdoor weather conditions in order to provide a quasi-experimental test of the effect of happiness on productivity. We find evidence of a positive impact on sales performance, which is driven by changes in labor productivity largely through workers converting more calls into sales, and to a lesser extent by making more calls per hour and adhering more closely to their schedule. We find no evidence in our setting of effects on measures of high-frequency labor supply such as attendance and break-taking.

Response:

Independent Variable (IV): Employee happiness.
Dependent Variable (DV): Overall firm productivity.
Exogenous Shock: Sudden company-wide increase in bonus payments.

Query: The diffusion of social media coincided with a worsening of mental health conditions among adolescents and young adults in the United States, giving rise to speculation that social media might be detrimental to mental health. In this paper, we provide quasi-experimental estimates of the impact of social media on mental health by leveraging a unique natural experiment: the staggered introduction of Facebook across U.S. colleges. Our analysis couples data on student mental health around the years of Facebook's expansion with a generalized difference-in-differences empirical strategy. We find that the roll-out of Facebook at a college increased symptoms of poor mental health, especially depression. We also find that, among students predicted to be most susceptible to mental illness, the introduction of Facebook led to increased utilization of mental healthcare services. Lastly, we find that, after the introduction of Facebook, students were more likely to report experiencing impairments to academic performance resulting from poor mental health. Additional evidence on mechanisms suggests that the results are due to Facebook fostering unfavorable social comparisons.

Response:

Independent Variable (IV): Exposure to social media.
Dependent Variable (DV): Mental health outcomes.
Exogenous Shock: staggered introduction of Facebook across U.S. colleges.
"""

# *******************************************************
# Create the PdfScraperMultiGraph instance and run it
# *******************************************************

multiple_search_graph = PdfScraperMultiGraph(
    prompt=prompt,
    source=sources,
    schema=None,
    config=graph_config,
)

result = multiple_search_graph.run()
print(json.dumps(result, indent=4))
55 changes: 55 additions & 0 deletions examples/anthropic/xml_scraper_graph_multi_haiku.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import XMLScraperMultiGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# Load variables from a .env file (if present) before os.getenv is consulted.
load_dotenv()

# ************************************************
# Read the XML file
# ************************************************

FILE_NAME = "inputs/books.xml"
# Resolve the input relative to this script so the example runs from any cwd.
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, encoding="utf-8") as xml_file:
    text = xml_file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

llm_settings = {
    "api_key": os.getenv("ANTHROPIC_API_KEY"),
    "model": "claude-3-haiku-20240307",
    "max_tokens": 4000,
}
graph_config = {"llm": llm_settings}

# ************************************************
# Create the XMLScraperMultiGraph instance and run it
# ************************************************

# Pass the content of the file (twice), not the file object.
xml_sources = [text, text]

xml_scraper_graph = XMLScraperMultiGraph(
    prompt="List me all the authors, title and genres of the books",
    source=xml_sources,
    config=graph_config,
)

result = xml_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = xml_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
59 changes: 59 additions & 0 deletions examples/bedrock/csv_scraper_graph_multi_bedrock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Basic example of scraping pipeline using CSVScraperMultiGraph from CSV documents
"""

import os
from dotenv import load_dotenv
import pandas as pd
from scrapegraphai.graphs import CSVScraperMultiGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

# Load variables from a .env file (if present) into the environment.
load_dotenv()

# ************************************************
# Read the CSV file
# ************************************************

FILE_NAME = "inputs/username.csv"
# Resolve the input relative to this script so the example runs from any cwd.
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

text = pd.read_csv(file_path)

# str(DataFrame) is the *display* repr: pandas replaces rows, columns and long
# cell values with "..." once its display limits are exceeded, so the model
# would be given an incomplete table. to_string() renders the whole frame.
csv_text = text.to_string()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        # NOTE(review): "client_name" looks like a placeholder — confirm what
        # the Bedrock integration expects here before running.
        "client": "client_name",
        "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        "temperature": 0.0,
    },
    "embeddings": {
        "model": "bedrock/cohere.embed-multilingual-v3",
    },
}

# ************************************************
# Create the CSVScraperMultiGraph instance and run it
# ************************************************

# The same table text is passed twice to exercise the multi-source graph.
csv_scraper_graph = CSVScraperMultiGraph(
    prompt="List me all the last names",
    source=[csv_text, csv_text],
    config=graph_config,
)

result = csv_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
59 changes: 59 additions & 0 deletions examples/bedrock/xml_scraper_graph_multi_bedrock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Basic example of scraping pipeline using XMLScraperMultiGraph from XML documents
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import XMLScraperMultiGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# Load variables from a .env file (if present) into the environment.
load_dotenv()

# ************************************************
# Read the XML file
# ************************************************

FILE_NAME = "inputs/books.xml"
# Resolve the input relative to this script so the example runs from any cwd.
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, encoding="utf-8") as xml_file:
    text = xml_file.read()

# ************************************************
# Define the configuration for the graph
# ************************************************

llm_settings = {
    # NOTE(review): "client_name" looks like a placeholder — confirm what the
    # Bedrock integration expects here before running.
    "client": "client_name",
    "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    "temperature": 0.0,
}
embedding_settings = {
    "model": "bedrock/cohere.embed-multilingual-v3",
}
graph_config = {
    "llm": llm_settings,
    "embeddings": embedding_settings,
}

# ************************************************
# Create the XMLScraperMultiGraph instance and run it
# ************************************************

# Pass the content of the file (twice), not the file object.
xml_sources = [text, text]

xml_scraper_graph = XMLScraperMultiGraph(
    prompt="List me all the authors, title and genres of the books",
    source=xml_sources,
    config=graph_config,
)

result = xml_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = xml_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

# Save to json or csv
convert_to_csv(result, "result")
convert_to_json(result, "result")
Loading
Loading