Skip to content

Commit c538852

Browse files
committed
first git
1 parent 5e64214 commit c538852

16 files changed

+240
-0
lines changed

.DS_Store

6 KB
Binary file not shown.

ClassGenerator.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import os
2+
from dotenv import load_dotenv
3+
from AmazScraper.pydantic_class import Response
4+
from AmazScraper.class_creator import create_class
5+
from langchain_openai import ChatOpenAI
6+
from langchain.prompts import PromptTemplate
7+
from langchain_core.pydantic_v1 import Field
8+
from langchain.output_parsers import PydanticOutputParser
9+
10+
# Load variables from the project's .env file into the process environment.
load_dotenv()

# OpenAI API key read from .env (key name: API_KEY — see README setup step 6).
MY_ENV_VAR = os.getenv('API_KEY')
13+
14+
class Generator:
    """LangChain extraction pipeline: prompt -> chat model -> pydantic parser.

    Given a field specification, regenerates the ``Response`` model source
    and builds a chain that answers a query with structured output.
    """

    def __init__(self, values: list):
        """Assemble the chain.

        Args:
            values: list of dicts with 'title', 'type' and 'description'
                keys describing the fields of the response model.
        """
        # Regenerate AmazScraper/pydantic_class.py from the field spec.
        # NOTE(review): `Response` was already imported at module load, so
        # the file written here only takes effect on the next interpreter
        # run — confirm this is intended.
        create_class(values)

        parser = PydanticOutputParser(pydantic_object=Response)
        prompt = PromptTemplate(
            template="Answer the user query.\n{format_instructions}\n{query}\n",
            input_variables=["query"],
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )
        model = ChatOpenAI(openai_api_key=MY_ENV_VAR)

        self.parser = parser
        self.prompt = prompt
        self.model = model
        self.chain = prompt | model | parser

    def invocation(self, query_info):
        """Run the chain on *query_info*, print and return the parsed result.

        Any failure is printed; in that case nothing is returned (None).
        """
        try:
            result = self.chain.invoke({"query": query_info})
            print(result)
            return result
        except Exception as e:
            print(f"Error: {e}")

README.md

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# 🤖 AmazScraper
2+
3+
This repo is a Python open-source library that uses AI to make scraping faster, without requiring any knowledge of the underlying HTML code.
4+
5+
The tech stack is fully in Python and the main libraries used are pydantic, langchain and requests.
6+
7+
This library lets you scrape and extract information from websites in just a few seconds, instead of writing ad-hoc code for each website.
8+
9+
You can either pass the HTML to scrape directly as a parameter from your code, or pass the link of the website from which you want to extract information.
11+
12+
# Setup
13+
14+
Follow these steps:
15+
16+
1. ```bash
17+
git clone https://github.com/VinciGit00/AmazScraper.git
18+
```
19+
2. ```bash
20+
pip install -r requirements.txt
21+
```
22+
3. Go to [https://openai.com](https://openai.com/) and login
23+
4. Now you can access to [https://platform.openai.com/docs/overview](https://platform.openai.com/docs/overview)
24+
5. Create a new API key and copy it
25+
![Screenshot 2024-01-26 alle 17.10.10.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/Screenshot_2024-01-26_alle_17.10.10.png)
26+
27+
![Screenshot 2024-01-26 alle 17.10.31.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/Screenshot_2024-01-26_alle_17.10.31.png)
28+
29+
![Screenshot 2024-01-26 alle 17.10.52.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/Screenshot_2024-01-26_alle_17.10.52.png)
30+
31+
![Screenshot 2024-01-26 alle 17.11.10.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/Screenshot_2024-01-26_alle_17.11.10.png)
32+
33+
6. Open the .env file inside main and paste the API key
34+
35+
```config
36+
API_KEY="your openai.com api key"
37+
```
38+
39+
7. You are ready to go! 🚀
40+
41+
# Practical use
42+
43+
## Using AmazScraper as a library
44+
45+
```python
46+
from AmazScraper.ClassGenerator import Generator
47+
48+
from AmazScraper.getter import get_function, scraper
49+
50+
values = [
51+
{
52+
"title": "title",
53+
"type": "str",
54+
"description": "Title of the items"
55+
}
56+
]
57+
58+
if __name__ == "__main__":
59+
60+
query_info = scraper("https://www.mockupworld.co", 4197)
61+
generator_instance = Generator(values)
62+
63+
res = generator_instance.invocation(query_info)
64+
```
65+
66+
### Case 2: Passing your own HTML code
67+
68+
```python
69+
import sys
70+
from AmazScraper.ClassGenerator import Generator
71+
72+
values = [
73+
{
74+
"title": "title",
75+
"type": "str",
76+
"description": "Title of the news"
77+
}
78+
]
79+
80+
# Example using a HTML code
81+
query_info = '''
82+
Given this code extract all the information in a json format about the news.
83+
<article class="c-card__wrapper aem_card_check_wrapper" data-cardindex="0">
84+
<div class="c-card__content">
85+
<h2 class="c-card__title">Booker show da 52 punti, chi ha più partite oltre quota 50</h2>
86+
<div class="c-card__label-wrapper c-label-wrapper">
87+
<span class="c-label c-label--article-heading">LA CLASSIFICA</span>
88+
</div>
89+
<p class="c-card__abstract">Il n° 1 dei Suns ha dominato la sfida vinta a New Orleans segnando 52 punti. Si tratta della...</p>
90+
<div class="c-card__info">
91+
<time class="c-card__date" datetime="20 gen - 07:54">20 gen - 07:54</time>
92+
<span class="c-card__content-data">
93+
<i class="icon icon--media-outline icon--gallery-outline icon--xxsmall icon--c-neutral">
94+
<svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery-outline">
95+
<path d="M26.174 32.174v31.975h44.588V32.174H26.174zm-3.08-9.238h50.747A6.159 6.159 0 0 1 80 29.095v38.134a6.159 6.159 0 0 1-6.159 6.158H23.095a6.159 6.159 0 0 1-6.159-6.158V29.095a6.159 6.159 0 0 1 6.159-6.159zM9.239 55.665a4.619 4.619 0 0 1-9.238 0V16.777C0 10.825 4.825 6 10.777 6H64.08a4.619 4.619 0 1 1 0 9.238H10.777c-.85 0-1.54.69-1.54 1.54v38.887z" fill="currentColor" fill-rule="evenodd"></path>
96+
</svg>
97+
</i>
98+
28 foto
99+
</span>
100+
</div>
101+
</div>
102+
<div class="c-card__img-wrapper">
103+
<figure class="o-aspect-ratio o-aspect-ratio--16-10 ">
104+
<img crossorigin="anonymous" class="c-card__img j-lazyload" alt="Partite con 50+ punti: Booker in Top-20" data-srcset="..." sizes="..." loading="lazy" data-src="...">
105+
<noscript>
106+
<img crossorigin="anonymous" class="c-card__img" alt="Partite con 50+ punti: Booker in Top-20" srcset="..." sizes="..." src="...">
107+
</noscript>
108+
</figure>
109+
<i class="icon icon--media icon--gallery icon--medium icon--c-primary">
110+
<svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery">
111+
<path d="M17.005 20.221h60.211c1.538 0 2.784 1.28 2.784 2.858v48.317c0 1.578-1.246 2.858-2.784 2.858H17.005c-1.537 0-2.784-1.28-2.784-2.858V23.079c0-1.578 1.247-2.858 2.784-2.858zM5.873 11.873V60.62a2.937 2.937 0 0 1-5.873 0V11.286A5.286 5.286 0 0 1 5.286 6h61.08a2.937 2.937 0 1 1 0 5.873H5.873z"></path>
112+
</svg>
113+
</i>
114+
</div>
115+
</article>
116+
'''
117+
118+
if __name__ == "__main__":
119+
generator_instance = Generator(values)
120+
121+
generator_instance.invocation(query_info)
122+
```
123+
124+
Developed by
125+
126+
![logo-removebg-preview.png](%F0%9F%A4%96%20AmazScraper%201c39ca53921c406da6a86898af0564ae/logo-removebg-preview.png)
2.33 KB
Binary file not shown.
996 Bytes
Binary file not shown.

__pycache__/getter.cpython-311.pyc

2.58 KB
Binary file not shown.
642 Bytes
Binary file not shown.

class_creator.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Template for the generated module: the import plus an (initially empty)
# pydantic model declaration; field lines are appended under the class header.
base_script = '''
from langchain_core.pydantic_v1 import BaseModel, Field

class Response(BaseModel):
'''


def create_class(data_dict: list):
    """Write AmazScraper/pydantic_class.py defining a `Response` model.

    Args:
        data_dict: iterable of mappings, each with 'title' (field name),
            'type' (annotation, e.g. 'str') and 'description' keys.

    Bug fix: the original mutated the module-level template via ``global``,
    so every call appended to the fields written by earlier calls. Building
    the source locally makes each call independent and idempotent.
    """
    parts = [base_script]
    for elem in data_dict:
        parts.append(
            f"    {elem['title']}: {elem['type']} = Field(description='{elem['description']}')\n"
        )

    # NOTE(review): the generated module is imported at startup elsewhere;
    # rewriting it here only affects the next interpreter run.
    with open("AmazScraper/pydantic_class.py", "w") as f:
        f.write("".join(parts))

getter.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
4+
5+
# Default request headers: a desktop browser User-Agent plus an English
# Accept-Language, to reduce the chance of bot blocking / localized pages.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
           'Accept-Language': 'en-US'}


def get_function(link: str, param=HEADERS) -> str:
    """
    It sends a GET request to the specified link with optional headers.

    Parameters:
        link (str): The URL to send the GET request to.
        param (dict): Optional headers to include in the request. Default is HEADERS.

    Returns:
        str: The content of the response as a string.

    Bug fix: the original always sent ``headers=HEADERS`` and silently
    ignored *param*, so caller-supplied headers had no effect.
    """
    response = requests.get(url=link, headers=param)
    # NOTE(review): str() on the raw bytes yields a "b'...'" wrapper with
    # escape sequences; response.text would decode properly, but downstream
    # code may rely on the current form — confirm before changing.
    return str(response.content)
22+
23+
24+
25+
26+
def scraper(link: str, max_char: int) -> str:
    """
    Scrapes the HTML at *link* and removes unwanted elements, text, and comments.

    Args:
        link (str): The URL whose HTML is to be scraped.
        max_char (int): The maximum number of characters in the returned HTML body.

    Returns:
        str: The cleaned <body> markup, newline-free and truncated to
        max_char characters. Returns "" when the document has no <body>
        (the original returned the literal string "None" in that case).
    """
    html = get_function(link)
    soup = BeautifulSoup(html, 'html.parser')

    unwanted_elements = ['head', 'script', 'style']
    unwanted_text = "Per discutere l'accesso automatizzato ai dati di Amazon"
    unwanted_comment = "Correios.DoNotSend"

    # Drop whole tags that never carry extractable content.
    for element in soup(unwanted_elements):
        element.decompose()

    # Remove anti-bot boilerplate and tracker comments. (The lambda
    # parameter is renamed: the original shadowed the outer `text` variable.)
    for node in soup.find_all(string=lambda s: unwanted_text in s or unwanted_comment in s):
        node.extract()

    # Guard against documents without a <body>; str(None) would be "None".
    if soup.body is None:
        return ""

    html_body = str(soup.body).replace('\n', '')

    # Limit the number of characters in the HTML body
    return html_body[:max_char]

pydantic_class.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
from langchain_core.pydantic_v1 import BaseModel, Field

# Generated file: class_creator.create_class() overwrites
# AmazScraper/pydantic_class.py at runtime — do not edit by hand.
class Response(BaseModel):
    # Field name/type/description come from the caller-supplied spec.
    title_swebsite: str = Field(description='Title of the items')

requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
beautifulsoup4==4.12.3
2+
langchain==0.1.4
3+
langchain_core==0.1.16
4+
langchain_openai==0.0.5
5+
python-dotenv==1.0.1
6+
Requests==2.31.0
Loading
Loading
Loading
Loading
Loading

0 commit comments

Comments
 (0)