Skip to content

Commit d0976dd

Browse files
authored
Merge branch 'main' into temp-1
2 parents e5ac020 + d116b77 commit d0976dd

27 files changed

+63
-171
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
## [1.22.0-beta.3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.22.0-beta.2...v1.22.0-beta.3) (2024-09-25)
22

33

4+
45
### Bug Fixes
56

67
* update to pydantic documentation ([76ce257](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/76ce257efb9d9f46c0693472a1fe54b39e4eb1ef))
@@ -15,6 +16,7 @@
1516
## [1.22.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.21.2-beta.2...v1.22.0-beta.1) (2024-09-24)
1617

1718

19+
1820
### Features
1921

2022
* add info to the dictionary for togetherai ([3b5ee76](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3b5ee767cbb91cb0ca8e4691195d16c3b57140bb))

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ name = "scrapegraphai"
33

44
version = "1.22.0b3"
55

6+
67
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
78
authors = [
89
{ name = "Marco Vinciguerra", email = "[email protected]" },

scrapegraphai/graphs/csv_scraper_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
Module for creating the smart scraper
33
"""
4-
54
from typing import Optional
65
from pydantic import BaseModel
76
from .base_graph import BaseGraph

scrapegraphai/graphs/csv_scraper_multi_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
CSVScraperMultiGraph Module
33
"""
4-
54
from copy import deepcopy
65
from typing import List, Optional
76
from pydantic import BaseModel

scrapegraphai/graphs/json_scraper_multi_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
JSONScraperMultiGraph Module
33
"""
4-
54
from copy import deepcopy
65
from typing import List, Optional
76
from pydantic import BaseModel

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
OmniScraperGraph Module
33
"""
4-
54
from typing import Optional
65
from pydantic import BaseModel
76
from .base_graph import BaseGraph

scrapegraphai/graphs/omni_search_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
OmniSearchGraph Module
33
"""
4-
54
from copy import deepcopy
65
from typing import Optional
76
from pydantic import BaseModel

scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
"""
32
PDFScraperGraph Module
43
"""

scrapegraphai/graphs/pdf_scraper_multi_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
PdfScraperMultiGraph Module
33
"""
4-
54
from copy import deepcopy
65
from typing import List, Optional
76
from pydantic import BaseModel

scrapegraphai/graphs/script_creator_multi_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
ScriptCreatorMultiGraph Module
33
"""
4-
54
from typing import List, Optional
65
from pydantic import BaseModel
76
from .base_graph import BaseGraph

scrapegraphai/graphs/search_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
SearchGraph Module
33
"""
4-
54
from copy import deepcopy
65
from typing import Optional, List
76
from pydantic import BaseModel

scrapegraphai/graphs/search_link_graph.py

Lines changed: 32 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
from pydantic import BaseModel
77
from .base_graph import BaseGraph
88
from .abstract_graph import AbstractGraph
9-
from ..nodes import ( FetchNode, ParseNode, SearchLinkNode )
9+
from ..nodes import (FetchNode,
10+
SearchLinkNode,
11+
SearchLinksWithContext)
1012

11-
class SearchLinkGraph(AbstractGraph):
13+
class SearchLinkGraph(AbstractGraph):
1214
"""
1315
SearchLinkGraph is a scraping pipeline that automates the process of
1416
extracting information from web pages using a natural language model
@@ -30,13 +32,7 @@ class SearchLinkGraph(AbstractGraph):
3032
config (dict): Configuration parameters for the graph.
3133
schema (BaseModel, optional): The schema for the graph output. Defaults to None.
3234
33-
Example:
34-
>>> smart_scraper = SearchLinkGraph(
35-
... "List me all the attractions in Chioggia.",
36-
... "https://en.wikipedia.org/wiki/Chioggia",
37-
... {"llm": {"model": "openai/gpt-3.5-turbo"}}
38-
... )
39-
>>> result = smart_scraper.run()
35+
4036
"""
4137

4238
def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
@@ -51,45 +47,41 @@ def _create_graph(self) -> BaseGraph:
5147
Returns:
5248
BaseGraph: A graph instance representing the web scraping workflow.
5349
"""
54-
5550
fetch_node = FetchNode(
56-
input="url| local_dir",
57-
output=["doc"],
58-
node_config={
59-
"llm_model": self.llm_model,
60-
"force": self.config.get("force", False),
61-
"cut": self.config.get("cut", True),
62-
"loader_kwargs": self.config.get("loader_kwargs", {}),
63-
}
64-
)
65-
parse_node = ParseNode(
66-
input="doc",
67-
output=["parsed_doc"],
68-
node_config={
69-
"chunk_size": self.model_token,
70-
"llm_model": self.llm_model
71-
}
72-
)
73-
search_link_node = SearchLinkNode(
74-
input="doc",
75-
output=["parsed_doc"],
76-
node_config={
77-
"llm_model": self.llm_model,
78-
"chunk_size": self.model_token,
79-
"filter_links": self.config.get("filter_links", None),
80-
"filter_config": self.config.get("filter_config", None)
81-
}
82-
)
51+
input="url| local_dir",
52+
output=["doc"],
53+
node_config={
54+
"force": self.config.get("force", False),
55+
"cut": self.config.get("cut", True),
56+
"loader_kwargs": self.config.get("loader_kwargs", {}),
57+
}
58+
)
59+
60+
if self.config.get("llm_style") == (True, None):
61+
search_link_node = SearchLinksWithContext(
62+
input="doc",
63+
output=["parsed_doc"],
64+
node_config={
65+
"llm_model": self.llm_model,
66+
"chunk_size": self.model_token,
67+
}
68+
)
69+
else:
70+
search_link_node = SearchLinkNode(
71+
input="doc",
72+
output=["parsed_doc"],
73+
node_config={
74+
"chunk_size": self.model_token,
75+
}
76+
)
8377

8478
return BaseGraph(
8579
nodes=[
8680
fetch_node,
87-
parse_node,
8881
search_link_node
8982
],
9083
edges=[
91-
(fetch_node, parse_node),
92-
(parse_node, search_link_node)
84+
(fetch_node, search_link_node)
9385
],
9486
entry_point=fetch_node,
9587
graph_name=self.__class__.__name__

scrapegraphai/graphs/smart_scraper_multi_concat_graph.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
SmartScraperMultiGraph Module
33
"""
4-
54
from copy import deepcopy
65
from typing import List, Optional
76
from pydantic import BaseModel
@@ -14,7 +13,6 @@
1413
)
1514
from ..utils.copy import safe_deepcopy
1615

17-
1816
class SmartScraperMultiConcatGraph(AbstractGraph):
1917
"""
2018
SmartScraperMultiGraph is a scraping pipeline that scrapes a
@@ -43,9 +41,8 @@ class SmartScraperMultiConcatGraph(AbstractGraph):
4341
>>> result = search_graph.run()
4442
"""
4543

46-
def __init__(self, prompt: str, source: List[str],
44+
def __init__(self, prompt: str, source: List[str],
4745
config: dict, schema: Optional[BaseModel] = None):
48-
4946
self.copy_config = safe_deepcopy(config)
5047

5148
self.copy_schema = deepcopy(schema)

scrapegraphai/graphs/smart_scraper_multi_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
SmartScraperMultiGraph Module
33
"""
4-
54
from copy import deepcopy
65
from typing import List, Optional
76
from pydantic import BaseModel

scrapegraphai/graphs/xml_scraper_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
XMLScraperGraph Module
33
"""
4-
54
from typing import Optional
65
from pydantic import BaseModel
76
from .base_graph import BaseGraph

scrapegraphai/graphs/xml_scraper_multi_graph.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
XMLScraperMultiGraph Module
33
"""
4-
54
from copy import deepcopy
65
from typing import List, Optional
76
from pydantic import BaseModel

scrapegraphai/nodes/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,5 @@
2525
from .concat_answers_node import ConcatAnswersNode
2626
from .prompt_refiner_node import PromptRefinerNode
2727
from .html_analyzer_node import HtmlAnalyzerNode
28-
from .generate_code_node import GenerateCodeNode
28+
from .generate_code_node import GenerateCodeNode
29+
from .search_node_with_context import SearchLinksWithContext

scrapegraphai/nodes/base_node.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
"""
22
BaseNode Module
33
"""
4-
54
import re
65
from abc import ABC, abstractmethod
76
from typing import List, Optional
87
from ..utils import get_logger
98

10-
119
class BaseNode(ABC):
1210
"""
1311
An abstract base class for nodes in a graph-based workflow,

scrapegraphai/nodes/concat_answers_node.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
ConcatAnswersNode Module
33
"""
4-
54
from typing import List, Optional
65
from ..utils.logging import get_logger
76
from .base_node import BaseNode

scrapegraphai/nodes/conditional_node.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def __init__(self):
3232
"""
3333
Initializes an empty ConditionalNode.
3434
"""
35-
3635
#super().__init__(node_name, "node", input, output, 2, node_config)
3736
pass
3837

scrapegraphai/nodes/fetch_screen_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def __init__(
1616
input: str,
1717
output: List[str],
1818
node_config: Optional[dict] = None,
19-
node_name: str = "FetchScreenNode",
19+
node_name: str = "FetchScreen",
2020
):
2121
super().__init__(node_name, "node", input, output, 2, node_config)
2222
self.url = node_config.get("link")

scrapegraphai/nodes/generate_answer_csv_node.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
"""
22
Module for generating the answer node
33
"""
4-
54
from typing import List, Optional
65
from langchain.prompts import PromptTemplate
76
from langchain_core.output_parsers import JsonOutputParser

scrapegraphai/nodes/graph_iterator_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
import asyncio
55
from typing import List, Optional
66
from tqdm.asyncio import tqdm
7-
from .base_node import BaseNode
87
from pydantic import BaseModel
8+
from .base_node import BaseNode
99

1010
DEFAULT_BATCHSIZE = 16
1111

scrapegraphai/nodes/parse_node.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def __init__(
3333
input: str,
3434
output: List[str],
3535
node_config: Optional[dict] = None,
36-
node_name: str = "Parse",
36+
node_name: str = "ParseNode",
3737
):
3838
super().__init__(node_name, "node", input, output, 1, node_config)
3939

@@ -88,7 +88,7 @@ def execute(self, state: dict) -> dict:
8888
link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
8989

9090
chunk_size = self.chunk_size
91-
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
91+
chunk_size = min(chunk_size - 500, int(chunk_size * 0.75))
9292

9393
if isinstance(docs_transformed, Document):
9494
chunks = split_text_into_chunks(text=docs_transformed.page_content,

0 commit comments

Comments
 (0)