Skip to content

Commit c538852

Browse files
committed
first git
1 parent 5e64214 commit c538852

16 files changed

+240
-0
lines changed

.DS_Store

6 KB
Binary file not shown.

ClassGenerator.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import os
2+
from dotenv import load_dotenv
3+
from AmazScraper.pydantic_class import Response
4+
from AmazScraper.class_creator import create_class
5+
from langchain_openai import ChatOpenAI
6+
from langchain.prompts import PromptTemplate
7+
from langchain_core.pydantic_v1 import Field
8+
from langchain.output_parsers import PydanticOutputParser
9+
10+
# Load variables from the project's .env file into the process environment.
load_dotenv()

# OpenAI API key read from .env (key name: API_KEY — see README setup step 6).
MY_ENV_VAR = os.getenv('API_KEY')
13+
14+
class Generator:
    """LangChain extraction pipeline: prompt -> chat model -> pydantic parser.

    Given a field specification, regenerates the ``Response`` model source
    and builds a chain that answers a query with structured output.
    """

    def __init__(self, values: list):
        """Assemble the chain.

        Args:
            values: list of dicts with 'title', 'type' and 'description'
                keys describing the fields of the response model.
        """
        # Regenerate AmazScraper/pydantic_class.py from the field spec.
        # NOTE(review): `Response` was already imported at module load, so
        # the file written here only takes effect on the next interpreter
        # run — confirm this is intended.
        create_class(values)

        parser = PydanticOutputParser(pydantic_object=Response)
        prompt = PromptTemplate(
            template="Answer the user query.\n{format_instructions}\n{query}\n",
            input_variables=["query"],
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )
        model = ChatOpenAI(openai_api_key=MY_ENV_VAR)

        self.parser = parser
        self.prompt = prompt
        self.model = model
        self.chain = prompt | model | parser

    def invocation(self, query_info):
        """Run the chain on *query_info*, print and return the parsed result.

        Any failure is printed; in that case nothing is returned (None).
        """
        try:
            result = self.chain.invoke({"query": query_info})
            print(result)
            return result
        except Exception as e:
            print(f"Error: {e}")

README.md

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# 🤖 AmazScraper
2+
3+
This repo is a Python open-source library that uses AI to make scraping faster, without requiring any knowledge of the underlying HTML code.
4+
5+
The tech stack is fully in Python and the main libraries used are pydantic, langchain and requests.
6+
7+
This library lets you scrape and extract information from websites in just a few seconds, instead of writing ad-hoc code for each website.
8+
9+
You can either pass the HTML to scrape directly as a parameter from your code, or pass the link of the website from which you want to extract information.
11+
12+
# Setup
13+
14+
Follow these steps:
15+
16+
1. ```bash
17+
git clone https://github.com/VinciGit00/AmazScraper.git
18+
```
19+
2. ```bash
20+
pip install -r requirements.txt
21+
```
22+
3. Go to [https://openai.com](https://openai.com/) and login
23+
4. Now you can access to [https://platform.openai.com/docs/overview](https://platform.openai.com/docs/overview)
24+
5. Create a new API key and copy it
25+
![Screenshot 2024-01-26 alle 17.10.10.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/Screenshot_2024-01-26_alle_17.10.10.png)
26+
27+
![Screenshot 2024-01-26 alle 17.10.31.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/Screenshot_2024-01-26_alle_17.10.31.png)
28+
29+
![Screenshot 2024-01-26 alle 17.10.52.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/Screenshot_2024-01-26_alle_17.10.52.png)
30+
31+
![Screenshot 2024-01-26 alle 17.11.10.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/Screenshot_2024-01-26_alle_17.11.10.png)
32+
33+
6. Open the .env file inside main and paste the API key
34+
35+
```config
36+
API_KEY="your openai.com api key"
37+
```
38+
39+
7. You are ready to go! 🚀
40+
41+
# Practical use
42+
43+
## Using AmazScraper as a library
44+
45+
```python
46+
from AmazScraper.ClassGenerator import Generator
47+
48+
from AmazScraper.getter import get_function, scraper
49+
50+
values = [
51+
{
52+
"title": "title",
53+
"type": "str",
54+
"description": "Title of the items"
55+
}
56+
]
57+
58+
if __name__ == "__main__":
59+
60+
query_info = scraper("https://www.mockupworld.co", 4197)
61+
generator_instance = Generator(values)
62+
63+
res = generator_instance.invocation(query_info)
64+
```
65+
66+
### Case 2: Passing your own HTML code
67+
68+
```python
69+
import sys
70+
from AmazScraper.ClassGenerator import Generator
71+
72+
values = [
73+
{
74+
"title": "title",
75+
"type": "str",
76+
"description": "Title of the news"
77+
}
78+
]
79+
80+
# Example using a HTML code
81+
query_info = '''
82+
Given this code extract all the information in a json format about the news.
83+
<article class="c-card__wrapper aem_card_check_wrapper" data-cardindex="0">
84+
<div class="c-card__content">
85+
<h2 class="c-card__title">Booker show da 52 punti, chi ha più partite oltre quota 50</h2>
86+
<div class="c-card__label-wrapper c-label-wrapper">
87+
<span class="c-label c-label--article-heading">LA CLASSIFICA</span>
88+
</div>
89+
<p class="c-card__abstract">Il n° 1 dei Suns ha dominato la sfida vinta a New Orleans segnando 52 punti. Si tratta della...</p>
90+
<div class="c-card__info">
91+
<time class="c-card__date" datetime="20 gen - 07:54">20 gen - 07:54</time>
92+
<span class="c-card__content-data">
93+
<i class="icon icon--media-outline icon--gallery-outline icon--xxsmall icon--c-neutral">
94+
<svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery-outline">
95+
<path d="M26.174 32.174v31.975h44.588V32.174H26.174zm-3.08-9.238h50.747A6.159 6.159 0 0 1 80 29.095v38.134a6.159 6.159 0 0 1-6.159 6.158H23.095a6.159 6.159 0 0 1-6.159-6.158V29.095a6.159 6.159 0 0 1 6.159-6.159zM9.239 55.665a4.619 4.619 0 0 1-9.238 0V16.777C0 10.825 4.825 6 10.777 6H64.08a4.619 4.619 0 1 1 0 9.238H10.777c-.85 0-1.54.69-1.54 1.54v38.887z" fill="currentColor" fill-rule="evenodd"></path>
96+
</svg>
97+
</i>
98+
28 foto
99+
</span>
100+
</div>
101+
</div>
102+
<div class="c-card__img-wrapper">
103+
<figure class="o-aspect-ratio o-aspect-ratio--16-10 ">
104+
<img crossorigin="anonymous" class="c-card__img j-lazyload" alt="Partite con 50+ punti: Booker in Top-20" data-srcset="..." sizes="..." loading="lazy" data-src="...">
105+
<noscript>
106+
<img crossorigin="anonymous" class="c-card__img" alt="Partite con 50+ punti: Booker in Top-20" srcset="..." sizes="..." src="...">
107+
</noscript>
108+
</figure>
109+
<i class="icon icon--media icon--gallery icon--medium icon--c-primary">
110+
<svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery">
111+
<path d="M17.005 20.221h60.211c1.538 0 2.784 1.28 2.784 2.858v48.317c0 1.578-1.246 2.858-2.784 2.858H17.005c-1.537 0-2.784-1.28-2.784-2.858V23.079c0-1.578 1.247-2.858 2.784-2.858zM5.873 11.873V60.62a2.937 2.937 0 0 1-5.873 0V11.286A5.286 5.286 0 0 1 5.286 6h61.08a2.937 2.937 0 1 1 0 5.873H5.873z"></path>
112+
</svg>
113+
</i>
114+
</div>
115+
</article>
116+
'''
117+
118+
if __name__ == "__main__":
119+
generator_instance = Generator(values)
120+
121+
generator_instance.invocation(query_info)
122+
```
123+
124+
Developed by
125+
126+
![logo-removebg-preview.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/logo-removebg-preview.png)
2.33 KB
Binary file not shown.
996 Bytes
Binary file not shown.

__pycache__/getter.cpython-311.pyc

2.58 KB
Binary file not shown.
642 Bytes
Binary file not shown.

class_creator.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Template for the generated module: the import plus an (initially empty)
# pydantic model declaration; field lines are appended under the class header.
base_script = '''
from langchain_core.pydantic_v1 import BaseModel, Field

class Response(BaseModel):
'''


def create_class(data_dict: list):
    """Write AmazScraper/pydantic_class.py defining a `Response` model.

    Args:
        data_dict: iterable of mappings, each with 'title' (field name),
            'type' (annotation, e.g. 'str') and 'description' keys.

    Bug fix: the original mutated the module-level template via ``global``,
    so every call appended to the fields written by earlier calls. Building
    the source locally makes each call independent and idempotent.
    """
    parts = [base_script]
    for elem in data_dict:
        parts.append(
            f"    {elem['title']}: {elem['type']} = Field(description='{elem['description']}')\n"
        )

    # NOTE(review): the generated module is imported at startup elsewhere;
    # rewriting it here only affects the next interpreter run.
    with open("AmazScraper/pydantic_class.py", "w") as f:
        f.write("".join(parts))

getter.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
4+
5+
# Default request headers: a desktop browser User-Agent plus an English
# Accept-Language, to reduce the chance of bot blocking / localized pages.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
           'Accept-Language': 'en-US'}


def get_function(link: str, param=HEADERS) -> str:
    """
    It sends a GET request to the specified link with optional headers.

    Parameters:
        link (str): The URL to send the GET request to.
        param (dict): Optional headers to include in the request. Default is HEADERS.

    Returns:
        str: The content of the response as a string.

    Bug fix: the original always sent ``headers=HEADERS`` and silently
    ignored *param*, so caller-supplied headers had no effect.
    """
    response = requests.get(url=link, headers=param)
    # NOTE(review): str() on the raw bytes yields a "b'...'" wrapper with
    # escape sequences; response.text would decode properly, but downstream
    # code may rely on the current form — confirm before changing.
    return str(response.content)
22+
23+
24+
25+
26+
def scraper(link: str, max_char: int) -> str:
    """
    Scrapes the HTML at *link* and removes unwanted elements, text, and comments.

    Args:
        link (str): The URL whose HTML is to be scraped.
        max_char (int): The maximum number of characters in the returned HTML body.

    Returns:
        str: The cleaned <body> markup, newline-free and truncated to
        max_char characters. Returns "" when the document has no <body>
        (the original returned the literal string "None" in that case).
    """
    html = get_function(link)
    soup = BeautifulSoup(html, 'html.parser')

    unwanted_elements = ['head', 'script', 'style']
    unwanted_text = "Per discutere l'accesso automatizzato ai dati di Amazon"
    unwanted_comment = "Correios.DoNotSend"

    # Drop whole tags that never carry extractable content.
    for element in soup(unwanted_elements):
        element.decompose()

    # Remove anti-bot boilerplate and tracker comments. (The lambda
    # parameter is renamed: the original shadowed the outer `text` variable.)
    for node in soup.find_all(string=lambda s: unwanted_text in s or unwanted_comment in s):
        node.extract()

    # Guard against documents without a <body>; str(None) would be "None".
    if soup.body is None:
        return ""

    html_body = str(soup.body).replace('\n', '')

    # Limit the number of characters in the HTML body
    return html_body[:max_char]

pydantic_class.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
from langchain_core.pydantic_v1 import BaseModel, Field

# Generated file: class_creator.create_class() overwrites
# AmazScraper/pydantic_class.py at runtime — do not edit by hand.
class Response(BaseModel):
    # Field name/type/description come from the caller-supplied spec.
    title_swebsite: str = Field(description='Title of the items')

requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
beautifulsoup4==4.12.3
2+
langchain==0.1.4
3+
langchain_core==0.1.16
4+
langchain_openai==0.0.5
5+
python-dotenv==1.0.1
6+
Requests==2.31.0
Loading
Loading
Loading
Loading
Loading

0 commit comments

Comments
 (0)