Skip to content

Commit bb2373d

Browse files
authored
Merge pull request #775 from U-C4N/main
This commit focuses on optimizing the utility modules in the codebase…
2 parents deed355 + 09c9678 commit bb2373d

File tree

19 files changed

+279
-165
lines changed

19 files changed

+279
-165
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
{
2+
"version": "v1.0.0",
3+
"entity": {
4+
"type": "individual",
5+
"role": "maintainer",
6+
"name": "Marco Vinciguerra",
7+
"email": "[email protected]",
8+
"phone": "",
9+
"description": "I'm dedicated to advancing web scraping and data extraction through AI-powered tools, focusing on making data access more accessible and ethical. My mission is to create solutions that uphold digital freedoms and support open internet principles.",
10+
"webpageUrl": {
11+
"url": "https://scrapegraphai.com",
12+
}
13+
},
14+
"projects": [
15+
{
16+
"guid": "scrapegraph-core",
17+
"name": "ScrapeGraphAI Core",
18+
"description": "An AI-powered web scraping framework that intelligently extracts structured data from websites with automatic pattern recognition, adaptive scraping strategies, and built-in rate limiting. Recognized as a top 200 open-source AI project globally.",
19+
"webpageUrl": {
20+
"url": "https://scrapegraphai.com/projects/core",
21+
},
22+
"repositoryUrl": {
23+
"url": "https://github.com/ScrapeGraphAI/Scrapegraph-ai",
24+
},
25+
"licenses": ["spdx:MIT"],
26+
"tags": ["web-scraping", "ai", "data-extraction", "python", "machine-learning", "open-source", "llm"]
27+
}
28+
],
29+
"funding": {
30+
"channels": [
31+
{
32+
"guid": "mybank",
33+
"type": "bank",
34+
"address": "",
35+
"description": "Will accept direct bank transfers. Please e-mail me for details."
36+
},
37+
{
38+
"guid": "mypay",
39+
"type": "payment-provider",
40+
"address": "https://example.com/payme/@myid",
41+
"description": "Pay with your debit/credit card through this gateway and set up recurring subscriptions."
42+
}
43+
],
44+
"plans": [
45+
{
46+
"guid": "infrastructure",
47+
"status": "active",
48+
"name": "Infrastructure Support",
49+
"description": "Help cover monthly cloud infrastructure costs, including API servers, model hosting, and data storage.",
50+
"amount": 750,
51+
"currency": "USD",
52+
"frequency": "monthly",
53+
"channels": ["mybank"]
54+
},
55+
{
56+
"guid": "developer-compensation",
57+
"status": "active",
58+
"name": "Developer Compensation",
59+
"description": "Provides financial support for developers working on maintenance, updates, and feature additions for the projects.",
60+
"amount": 2500,
61+
"currency": "USD",
62+
"frequency": "monthly",
63+
"channels": ["mybank"]
64+
},
65+
{
66+
"guid": "community-backer",
67+
"status": "active",
68+
"name": "Community Backer",
69+
"description": "Support our open-source efforts with any contribution amount. Every donation helps!",
70+
"amount": 5,
71+
"currency": "USD",
72+
"frequency": "monthly",
73+
"channels": ["mypay"]
74+
}
75+
],
76+
"history": [
77+
{
78+
"year": 2024,
79+
"income": 15000,
80+
"expenses": 15000,
81+
"taxes": 0,
82+
"currency": "USD",
83+
"description": "Experienced a temporary dip in donations, with improvements expected."
84+
}
85+
]
86+
}
87+
}

CHANGELOG.md

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,50 @@
1-
## [1.27.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.12...v1.27.0-beta.13) (2024-10-29)
1+
## [1.27.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.7...v1.27.0) (2024-10-26)
2+
3+
4+
### Features
5+
6+
* add conditional node structure to the smart_scraper_graph and implemented a structured way to check condition ([cacd9cd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cacd9cde004dace1a7dcc27981245632a78b95f3))
7+
* add integration with scrape.do ([ae275ec](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/ae275ec5e86c0bb8fdbeadc2e5f69816d1dea635))
8+
* add model integration gpt4 ([51c55eb](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/51c55eb3a2984ba60572edbcdea4c30620e18d76))
9+
* implement ScrapeGraph class for only web scraping automation ([612c644](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/612c644623fa6f4fe77a64a5f1a6a4d6cd5f4254))
10+
* Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs and merge the content first and finally generates answers to a given prompt. ([3e3e1b2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3e3e1b2f3ae8ed803d03b3b44b199e139baa68d4))
11+
* refactoring of export functions ([0ea00c0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/0ea00c078f2811f0d1b356bd84cafde80763c703))
12+
* refactoring of get_probable_tags node ([f658092](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f658092dffb20ea111cc00950f617057482788f4))
13+
* refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887))
14+
215

316

417
### Bug Fixes
518

19+
* fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422))
20+
* fix the example variable name ([69ff649](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/69ff6495564a5c670b89c0f802ebb1602f0e7cfa))
21+
* remove variable "max_result" not being used in the code ([e76a68a](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e76a68a782e5bce48d421cb620d0b7bffa412918))
22+
23+
24+
### chore
25+
26+
* fix example ([9cd9a87](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9cd9a874f91bbbb2990444818e8ab2d0855cc361))
27+
28+
29+
### Test
30+
31+
* Add scrape_graph test ([cdb3c11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/cdb3c1100ee1117afedbc70437317acaf7c7c1d3))
32+
* Add smart_scraper_multi_parse_merge_first_graph test ([464b8b0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/464b8b04ea0d51280849173d5eda92d4d4db8612))
33+
34+
35+
### CI
36+
37+
* **release:** 1.26.6-beta.1 [skip ci] ([e0fc457](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/e0fc457d1a850f3306d473fbde55dd800133b404))
38+
* **release:** 1.27.0-beta.1 [skip ci] ([9266a36](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/9266a36b2efdf7027470d59aa14b654d68f7cb51))
39+
* **release:** 1.27.0-beta.10 [skip ci] ([eee131e](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/eee131e959a36a4471f72610eefbc1764808b6be))
40+
* **release:** 1.27.0-beta.2 [skip ci] ([d84d295](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/d84d29538985ef8d04badfed547c6fdc73d7774d))
41+
* **release:** 1.27.0-beta.3 [skip ci] ([f576afa](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f576afaf0c1dd6d1dbf79fd5e642f6dca9dbe862))
42+
* **release:** 1.27.0-beta.4 [skip ci] ([3d6bbcd](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3d6bbcdaa3828ff257adb22f2f7c1a46343de5b5))
43+
* **release:** 1.27.0-beta.5 [skip ci] ([5002c71](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/5002c713d5a76b2c2e4313f888d9768e3f3142e1))
44+
* **release:** 1.27.0-beta.6 [skip ci] ([94b9836](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/94b9836ef6cd9c24bb8c04d7049d5477cc8ed807))
45+
* **release:** 1.27.0-beta.7 [skip ci] ([407f1ce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/407f1ce4eb22fb284ef0624dd3f7bf7ba432fa5c))
46+
* **release:** 1.27.0-beta.8 [skip ci] ([4f1ed93](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4f1ed939e671e46bb546b6b605db87e87c0d66ee))
47+
* **release:** 1.27.0-beta.9 [skip ci] ([fd57cc7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd57cc7c126658960e33b7214c2cc656ea032d8f))
648
* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)
749

850
## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28)

docs/source/getting_started/examples.rst

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ OpenAI models
2222
graph_config = {
2323
"llm": {
2424
"api_key": openai_key,
25-
"model": "openai/gpt-3.5-turbo",
25+
"model": "openai/gpt-4o",
2626
},
2727
}
2828
@@ -67,11 +67,6 @@ After that, you can run the following code, using only your machine resources br
6767
"format": "json", # Ollama needs the format to be specified explicitly
6868
"model_tokens": 2000, # depending on the model set context length
6969
"base_url": "http://localhost:11434", # set ollama URL of the local host (YOU CAN CHANGE IT, if you have a different endpoint
70-
},
71-
"embeddings": {
72-
"model": "ollama/nomic-embed-text",
73-
"temperature": 0,
74-
"base_url": "http://localhost:11434", # set ollama URL
7570
}
7671
}
7772

docs/source/introduction/overview.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,16 @@ OpenAI Models
3232
- GPT-3.5 Turbo (16,385 tokens)
3333
- GPT-4 (8,192 tokens)
3434
- GPT-4 Turbo Preview (128,000 tokens)
35+
- GPT-4o (128,000 tokens)
36+
- GPT-4o-mini (128,000 tokens)
3537

3638
Azure OpenAI Models
3739
-------------------
3840
- GPT-3.5 Turbo (16,385 tokens)
3941
- GPT-4 (8,192 tokens)
4042
- GPT-4 Turbo Preview (128,000 tokens)
43+
- GPT-4o (128,000 tokens)
44+
- GPT-4o-mini (128,000 tokens)
4145

4246
Google AI Models
4347
----------------

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
[project]
22
name = "scrapegraphai"
33

4+
45
version = "1.27.0b13"
56

67

scrapegraphai/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""
2-
__init__.py file for scrapegraphai folder
2+
__init__.py file for scrapegraphai folder
33
"""

scrapegraphai/builders/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
__init__.py file for builders folder
2+
This module contains the builders for constructing various components in the ScrapeGraphAI application.
33
"""
44

55
from .graph_builder import GraphBuilder

scrapegraphai/docloaders/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
"""__init__.py file for docloaders folder"""
1+
"""
2+
This module handles document loading functionalities for the ScrapeGraphAI application.
3+
"""
24

35
from .chromium import ChromiumLoader
46
from .browser_base import browser_base_fetch

scrapegraphai/graphs/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
"""
2-
__init__.py file for graphs folder
1+
"""
2+
This module defines the graph structures and related functionalities for the ScrapeGraphAI application.
33
"""
44

55
from .abstract_graph import AbstractGraph

scrapegraphai/graphs/document_scraper_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
md_scraper module
2+
This module implements the Document Scraper Graph for the ScrapeGraphAI application.
33
"""
44
from typing import Optional
55
import logging

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
OmniScraperGraph Module
2+
This module implements the Omni Scraper Graph for the ScrapeGraphAI application.
33
"""
44
from typing import Optional
55
from pydantic import BaseModel

scrapegraphai/helpers/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
"""
2-
__init__.py for the helpers folder
1+
"""
2+
This module provides helper functions and utilities for the ScrapeGraphAI application.
33
"""
44
from .nodes_metadata import nodes_metadata
55
from .schemas import graph_schema

scrapegraphai/models/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
__init__.py file for models folder
2+
This module contains the model definitions used in the ScrapeGraphAI application.
33
"""
44
from .openai_itt import OpenAIImageToText
55
from .openai_tts import OpenAITextToSpeech

scrapegraphai/nodes/base_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
"""
2-
BaseNode Module
1+
"""
2+
This module defines the base node class for the ScrapeGraphAI application.
33
"""
44
import re
55
from abc import ABC, abstractmethod

scrapegraphai/nodes/fetch_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
""""
1+
"""
22
FetchNode Module
33
"""
44
import json

scrapegraphai/prompts/description_node_prompts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
description node prompts
2+
This module contains prompts for description nodes in the ScrapeGraphAI application.
33
"""
44

55
DESCRIPTION_NODE_PROMPT = """

scrapegraphai/utils/cleanup_html.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,18 @@ def minify_html(html):
6060
"""
6161
minify_html function
6262
"""
63-
html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
64-
65-
html = re.sub(r'>\s+<', '><', html)
66-
html = re.sub(r'\s+>', '>', html)
67-
html = re.sub(r'<\s+', '<', html)
68-
html = re.sub(r'\s+', ' ', html)
69-
html = re.sub(r'\s*=\s*', '=', html)
63+
# Combine multiple regex operations into one for better performance
64+
patterns = [
65+
(r'<!--.*?-->', '', re.DOTALL),
66+
(r'>\s+<', '><', 0),
67+
(r'\s+>', '>', 0),
68+
(r'<\s+', '<', 0),
69+
(r'\s+', ' ', 0),
70+
(r'\s*=\s*', '=', 0)
71+
]
72+
73+
for pattern, repl, flags in patterns:
74+
html = re.sub(pattern, repl, html, flags=flags)
7075

7176
return html.strip()
7277

scrapegraphai/utils/copy.py

Lines changed: 28 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -30,56 +30,38 @@ def is_boto3_client(obj):
3030

3131
def safe_deepcopy(obj: Any) -> Any:
3232
"""
33-
Attempts to create a deep copy of the object using `copy.deepcopy`
34-
whenever possible. If that fails, it falls back to custom deep copy
35-
logic. If that also fails, it raises a `DeepCopyError`.
36-
33+
Safely create a deep copy of an object, handling special cases.
34+
3735
Args:
38-
obj (Any): The object to be copied, which can be of any type.
39-
36+
obj: Object to copy
37+
4038
Returns:
41-
Any: A deep copy of the object if possible; otherwise, a shallow
42-
copy if deep copying fails; if neither is possible, the original
43-
object is returned.
39+
Deep copy of the object
40+
4441
Raises:
45-
DeepCopyError: If the object cannot be deep-copied or shallow-copied.
42+
DeepCopyError: If object cannot be deep copied
4643
"""
47-
4844
try:
49-
50-
return copy.deepcopy(obj)
51-
except (TypeError, AttributeError) as e:
52-
45+
# Handle special cases first
46+
if obj is None or isinstance(obj, (str, int, float, bool)):
47+
return obj
48+
49+
if isinstance(obj, (list, set)):
50+
return type(obj)(safe_deepcopy(v) for v in obj)
51+
5352
if isinstance(obj, dict):
54-
new_obj = {}
55-
56-
for k, v in obj.items():
57-
new_obj[k] = safe_deepcopy(v)
58-
return new_obj
59-
60-
elif isinstance(obj, list):
61-
new_obj = []
62-
63-
for v in obj:
64-
new_obj.append(safe_deepcopy(v))
65-
return new_obj
66-
67-
elif isinstance(obj, tuple):
68-
new_obj = tuple(safe_deepcopy(v) for v in obj)
69-
70-
return new_obj
71-
72-
elif isinstance(obj, frozenset):
73-
new_obj = frozenset(safe_deepcopy(v) for v in obj)
74-
return new_obj
75-
76-
elif is_boto3_client(obj):
53+
return {k: safe_deepcopy(v) for k, v in obj.items()}
54+
55+
if isinstance(obj, tuple):
56+
return tuple(safe_deepcopy(v) for v in obj)
57+
58+
if isinstance(obj, frozenset):
59+
return frozenset(safe_deepcopy(v) for v in obj)
60+
61+
if is_boto3_client(obj):
7762
return obj
78-
79-
else:
80-
try:
81-
return copy.copy(obj)
82-
except (TypeError, AttributeError):
83-
raise DeepCopyError(
84-
f"Cannot deep copy the object of type {type(obj)}"
85-
) from e
63+
64+
return copy.copy(obj)
65+
66+
except Exception as e:
67+
raise DeepCopyError(f"Cannot deep copy object of type {type(obj)}") from e

0 commit comments

Comments
 (0)