
Replacement of Google search with googlesearch-python and integration of Proxy #743


Merged · 7 commits · Oct 11, 2024
60 changes: 60 additions & 0 deletions examples/together/code_generator_graph_togehter.py
@@ -0,0 +1,60 @@
"""
Basic example of scraping pipeline using Code Generator with schema
"""

import os, json
from typing import List
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from scrapegraphai.graphs import CodeGeneratorGraph

load_dotenv()

# ************************************************
# Define the output schema for the graph
# ************************************************

class Project(BaseModel):
    title: str = Field(description="The title of the project")
    description: str = Field(description="The description of the project")

class Projects(BaseModel):
    projects: List[Project]

# ************************************************
# Define the configuration for the graph
# ************************************************

together_key = os.getenv("TOGETHER_KEY")

graph_config = {
    "llm": {
        "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        "api_key": together_key,
    },
    "verbose": True,
    "headless": False,
    "reduction": 2,
    "max_iterations": {
        "overall": 10,
        "syntax": 3,
        "execution": 3,
        "validation": 3,
        "semantic": 3
    },
    "output_file_name": "extracted_data.py"
}

# ************************************************
# Create the CodeGeneratorGraph instance and run it
# ************************************************

code_generator_graph = CodeGeneratorGraph(
    prompt="List me all the projects with their description",
    source="https://perinim.github.io/projects/",
    schema=Projects,
    config=graph_config
)

result = code_generator_graph.run()
print(result)
7 changes: 3 additions & 4 deletions pyproject.toml
@@ -28,13 +28,12 @@ dependencies = [
"free-proxy>=1.1.1",
"playwright>=1.43.0",
"undetected-playwright>=0.3.0",
"google>=3.0.0",
"langchain-ollama>=0.1.3",
"simpleeval>=1.0.0",
"semchunk>=2.2.0",
"transformers>=4.44.2",
"qdrant-client>=1.11.3",
"fastembed>=0.3.6"
"semchunk>=2.2.0",
"transformers>=4.44.2",
"googlesearch-python>=1.2.5"
]

license = "MIT"
4 changes: 2 additions & 2 deletions requirements.txt
@@ -15,7 +15,7 @@ minify-html>=0.15.0
free-proxy>=1.1.1
playwright>=1.43.0
undetected-playwright>=0.3.0
google>=3.0.0
semchunk>=1.0.1
langchain-ollama>=0.1.3
simpleeval>=0.9.13
simpleeval>=0.9.13
googlesearch-python>=1.2.5
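
The dependency swap changes which package provides the googlesearch module: the removed google package exposed search(query, stop=N), while googlesearch-python exposes search(term, num_results=N, ...) and, in the version pinned here, also accepts a proxy argument. A minimal sketch of the new call, assuming only that googlesearch-python>=1.2.5 is installed:

# Sketch: direct use of googlesearch-python (illustrative query, no proxy)
from googlesearch import search

for url in search("scrapegraphai documentation", num_results=5):
    print(url)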
1 change: 1 addition & 0 deletions scrapegraphai/graphs/search_graph.py
@@ -65,6 +65,7 @@ def _create_graph(self) -> BaseGraph:
            node_config={
                "llm_model": self.llm_model,
                "max_results": self.max_results,
                "loader_kwargs": self.loader_kwargs,
                "search_engine": self.copy_config.get("search_engine")
            }
        )
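
Forwarding loader_kwargs into the node config is what lets a graph-level proxy reach the search step. A minimal sketch of a SearchGraph configuration that exercises it, with the llm block and proxy endpoint as illustrative placeholders; the proxy keys mirror what format_proxy in research_web.py expects:

# Sketch: graph-level proxy passed to the search step via loader_kwargs (values are illustrative)
from scrapegraphai.graphs import SearchGraph

graph_config = {
    "llm": {"model": "openai/gpt-4o-mini", "api_key": "YOUR_API_KEY"},  # placeholder llm config
    "loader_kwargs": {
        "proxy": {
            "server": "203.0.113.10:8080",   # hypothetical proxy endpoint
            "username": "proxy_user",        # hypothetical credentials
            "password": "proxy_pass",
        },
    },
    "max_results": 5,
    "verbose": True,
}

search_graph = SearchGraph(
    prompt="List the latest ScrapeGraphAI tutorials",
    config=graph_config,
)
print(search_graph.run())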
5 changes: 3 additions & 2 deletions scrapegraphai/nodes/search_internet_node.py
@@ -41,6 +41,7 @@ def __init__(
        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )
        self.proxy = node_config.get("loader_kwargs", {}).get("proxy", None)
        self.search_engine = (
            node_config["search_engine"]
            if node_config.get("search_engine")
@@ -93,8 +94,8 @@ def execute(self, state: dict) -> dict:

        self.logger.info(f"Search Query: {search_query}")

        answer = search_on_web(query=search_query, max_results=self.max_results,
                               search_engine=self.search_engine)
        answer = search_on_web(query=search_query, num_results=self.max_results,
                               search_engine=self.search_engine, proxy=self.proxy)

        if len(answer) == 0:
            raise ValueError("Zero results found for the search query.")
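
On the node side, the only new requirement is that node_config carries a loader_kwargs entry with a proxy; everything else is unchanged. A rough sketch of the shape the node now reads, with hypothetical values:

# Sketch: the parts of node_config that SearchInternetNode consults (values are hypothetical)
node_config = {
    "llm_model": None,               # in the real graph this is the chat model instance
    "max_results": 10,
    "search_engine": "google",
    "verbose": True,
    "loader_kwargs": {
        "proxy": {"server": "203.0.113.10:8080",
                  "username": "proxy_user",
                  "password": "proxy_pass"},
    },
}
# self.proxy then resolves to that dict and is handed to search_on_web along with the query.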
24 changes: 22 additions & 2 deletions scrapegraphai/utils/research_web.py
@@ -10,7 +10,7 @@

def search_on_web(query: str, search_engine: str = "Google",
                  max_results: int = 10, port: int = 8080,
                  timeout: int = 10) -> List[str]:
                  timeout: int = 10, proxy: str | dict = None) -> List[str]:
"""
Searches the web for a given query using specified search
engine options and filters out PDF links.
Expand All @@ -23,6 +23,7 @@ def search_on_web(query: str, search_engine: str = "Google",
port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.
timeout (int, optional): The number of seconds to wait
for a response from a request. Default is 10 seconds.
proxy (dict or string, optional): The proxy server to use for the request. Default is None.

Returns:
List[str]: A list of URLs as strings that are the search results, excluding any PDF links.
Expand All @@ -36,6 +37,22 @@ def search_on_web(query: str, search_engine: str = "Google",
['http://example.com', 'http://example.org', ...]
"""

    def format_proxy(proxy):
        if isinstance(proxy, dict):
            server = proxy.get('server')
            username = proxy.get('username')
            password = proxy.get('password')

            if all([username, password, server]):
                proxy_url = f"http://{username}:{password}@{server}"
                return proxy_url
            else:
                raise ValueError("Proxy dictionary is missing required fields.")
        elif isinstance(proxy, str):
            return proxy  # "https://username:password@ip:port"
        else:
            raise TypeError("Proxy should be a dictionary or a string.")

    def filter_pdf_links(links: List[str]) -> List[str]:
        """
        Filters out any links that point to PDF files.
@@ -48,9 +65,12 @@ def filter_pdf_links(links: List[str]) -> List[str]:
        """
        return [link for link in links if not link.lower().endswith('.pdf')]

    if proxy:
        proxy = format_proxy(proxy)

    if search_engine.lower() == "google":
        res = []
        for url in google_search(query, stop=max_results):
        for url in google_search(query, num_results=max_results, proxy=proxy):
            res.append(url)
        return filter_pdf_links(res)
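
A short usage sketch of the updated helper, assuming it is imported from scrapegraphai.utils.research_web; both proxy forms shown are the ones format_proxy accepts, and the endpoint and credentials are placeholders:

# Sketch: calling search_on_web with either proxy form (endpoint and credentials are placeholders)
from scrapegraphai.utils.research_web import search_on_web

# Dict form: format_proxy builds "http://username:password@server" from these keys.
urls = search_on_web(
    query="scrapegraphai documentation",
    search_engine="Google",
    max_results=5,
    proxy={"server": "203.0.113.10:8080", "username": "proxy_user", "password": "proxy_pass"},
)

# String form is passed through unchanged.
urls = search_on_web(
    query="scrapegraphai documentation",
    proxy="https://proxy_user:proxy_pass@203.0.113.10:8080",
)
print(urls)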
