|
| 1 | +# 🤖 AmazScraper |
| 2 | + |
| 3 | +This repo is an open-source Python library for faster web scraping using AI, without requiring any knowledge of the HTML code. |
| 4 | + |
| 5 | +The tech stack is fully in Python and the main libraries used are pydantic, langchain and requests. |
| 6 | + |
| 7 | +This library lets you scrape and extract information from websites in just a few seconds instead of writing ad-hoc code for each website. |
| 8 | + |
| 9 | +This library can work either by receiving the HTML to scrape as a parameter from your code, or by receiving the |
| 10 | +link of the website from which you want to extract information. |
| 11 | + |
| 12 | +# Setup |
| 13 | + |
| 14 | +Follow these steps: |
| 15 | + |
| 16 | +1. ```bash |
| 17 | + git clone https://github.com/VinciGit00/AmazScraper.git |
| 18 | + ``` |
| 19 | +2. ```bash |
| 20 | + pip install -r requirements.txt |
| 21 | + ``` |
| 22 | +3. Go to [https://openai.com](https://openai.com/) and log in |
| 23 | +4. Now you can access [https://platform.openai.com/docs/overview](https://platform.openai.com/docs/overview) |
| 24 | +5. Create a new API key and copy it |
| 25 | +  |
| 26 | + |
| 27 | + |
| 28 | + |
| 29 | + |
| 30 | + |
| 31 | + |
| 32 | + |
| 33 | +6. Open the .env file inside main and paste the API key |
| 34 | + |
| 35 | +```config |
| 36 | +API_KEY="your openai.com api key" |
| 37 | +``` |
| 38 | + |
| 39 | +7. You are ready to go! 🚀 |
| 40 | + |
| 41 | +# Practical use |
| 42 | + |
| 43 | +## Using AmazScraper as a library |
| 44 | + |
| 45 | +```python |
| 46 | +from AmazScraper.ClassGenerator import Generator |
| 47 | +
|
| 48 | +from AmazScraper.getter import get_function, scraper |
| 49 | +
|
| 50 | +values = [ |
| 51 | + { |
| 52 | + "title": "title", |
| 53 | + "type": "str", |
| 54 | + "description": "Title of the items" |
| 55 | + } |
| 56 | +] |
| 57 | +
|
| 58 | +if __name__ == "__main__": |
| 59 | +
|
| 60 | + query_info = scraper("https://www.mockupworld.co", 4197) |
| 61 | + generator_instance = Generator(values) |
| 62 | +
|
| 63 | + res = generator_instance.invocation(query_info) |
| 64 | +``` |
| 65 | +
|
| 66 | +### Case 2: Passing your own HTML code |
| 67 | +
|
| 68 | +```python |
| 69 | +import sys |
| 70 | +from AmazScraper.ClassGenerator import Generator |
| 71 | +
|
| 72 | +values = [ |
| 73 | + { |
| 74 | + "title": "title", |
| 75 | + "type": "str", |
| 76 | + "description": "Title of the news" |
| 77 | + } |
| 78 | +] |
| 79 | +
|
| 80 | +# Example using an HTML snippet |
| 81 | +query_info = ''' |
| 82 | + Given this code extract all the information in a json format about the news. |
| 83 | + <article class="c-card__wrapper aem_card_check_wrapper" data-cardindex="0"> |
| 84 | + <div class="c-card__content"> |
| 85 | + <h2 class="c-card__title">Booker show da 52 punti, chi ha più partite oltre quota 50</h2> |
| 86 | + <div class="c-card__label-wrapper c-label-wrapper"> |
| 87 | + <span class="c-label c-label--article-heading">LA CLASSIFICA</span> |
| 88 | + </div> |
| 89 | + <p class="c-card__abstract">Il n° 1 dei Suns ha dominato la sfida vinta a New Orleans segnando 52 punti. Si tratta della...</p> |
| 90 | + <div class="c-card__info"> |
| 91 | + <time class="c-card__date" datetime="20 gen - 07:54">20 gen - 07:54</time> |
| 92 | + <span class="c-card__content-data"> |
| 93 | + <i class="icon icon--media-outline icon--gallery-outline icon--xxsmall icon--c-neutral"> |
| 94 | + <svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery-outline"> |
| 95 | + <path d="M26.174 32.174v31.975h44.588V32.174H26.174zm-3.08-9.238h50.747A6.159 6.159 0 0 1 80 29.095v38.134a6.159 6.159 0 0 1-6.159 6.158H23.095a6.159 6.159 0 0 1-6.159-6.158V29.095a6.159 6.159 0 0 1 6.159-6.159zM9.239 55.665a4.619 4.619 0 0 1-9.238 0V16.777C0 10.825 4.825 6 10.777 6H64.08a4.619 4.619 0 1 1 0 9.238H10.777c-.85 0-1.54.69-1.54 1.54v38.887z" fill="currentColor" fill-rule="evenodd"></path> |
| 96 | + </svg> |
| 97 | + </i> |
| 98 | + 28 foto |
| 99 | + </span> |
| 100 | + </div> |
| 101 | + </div> |
| 102 | + <div class="c-card__img-wrapper"> |
| 103 | + <figure class="o-aspect-ratio o-aspect-ratio--16-10 "> |
| 104 | + <img crossorigin="anonymous" class="c-card__img j-lazyload" alt="Partite con 50+ punti: Booker in Top-20" data-srcset="..." sizes="..." loading="lazy" data-src="..."> |
| 105 | + <noscript> |
| 106 | + <img crossorigin="anonymous" class="c-card__img" alt="Partite con 50+ punti: Booker in Top-20" srcset="..." sizes="..." src="..."> |
| 107 | + </noscript> |
| 108 | + </figure> |
| 109 | + <i class="icon icon--media icon--gallery icon--medium icon--c-primary"> |
| 110 | + <svg width="80" height="80" viewBox="0 0 80 80" xmlns="http://www.w3.org/2000/svg" class="icon__svg icon__svg--gallery"> |
| 111 | + <path d="M17.005 20.221h60.211c1.538 0 2.784 1.28 2.784 2.858v48.317c0 1.578-1.246 2.858-2.784 2.858H17.005c-1.537 0-2.784-1.28-2.784-2.858V23.079c0-1.578 1.247-2.858 2.784-2.858zM5.873 11.873V60.62a2.937 2.937 0 0 1-5.873 0V11.286A5.286 5.286 0 0 1 5.286 6h61.08a2.937 2.937 0 1 1 0 5.873H5.873z"></path> |
| 112 | + </svg> |
| 113 | + </i> |
| 114 | + </div> |
| 115 | + </article> |
| 116 | + ''' |
| 117 | +
|
| 118 | +if __name__ == "__main__": |
| 119 | + generator_instance = Generator(values) |
| 120 | +
|
| 121 | + generator_instance.invocation(query_info) |
| 122 | +``` |
| 123 | +
|
| 124 | +Developed by |
| 125 | +
|
| 126 | + |
0 commit comments