- """
- Chromium module
+ """
+ chromium module
"""
4
4
import asyncio
5
5
from typing import Any , AsyncIterator , Iterator , List , Optional
6
6
from langchain_community .document_loaders .base import BaseLoader
7
7
from langchain_core .documents import Document
8
+ import aiohttp
9
+ import async_timeout
8
10
from ..utils import Proxy , dynamic_import , get_logger , parse_or_search_proxy
9
11
10
12
logger = get_logger ("web-loader" )
@@ -21,6 +23,9 @@ class ChromiumLoader(BaseLoader):
        urls: A list of URLs to scrape content from.
    """

+     RETRY_LIMIT = 3
+     TIMEOUT = 10
+
    def __init__(
        self,
        urls: List[str],
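The two new class attributes are plain integers consumed by the retry loops added further down: each scrape is attempted at most RETRY_LIMIT times, and every attempt is wrapped in an async_timeout guard of TIMEOUT seconds. Below is a minimal standalone sketch of that pattern; the fetch_with_retries helper and the aiohttp-based fetch are illustrative placeholders, not part of this change.

```python
# Minimal sketch of the retry/timeout pattern the diff introduces.
# fetch_with_retries and the aiohttp fetch are placeholders for illustration only.
import asyncio

import aiohttp
import async_timeout

RETRY_LIMIT = 3   # mirrors ChromiumLoader.RETRY_LIMIT
TIMEOUT = 10      # mirrors ChromiumLoader.TIMEOUT (seconds)


async def fetch_with_retries(url: str) -> str:
    attempt = 0
    result = ""
    while attempt < RETRY_LIMIT:
        try:
            # async_timeout cancels the awaited work once TIMEOUT seconds elapse
            async with async_timeout.timeout(TIMEOUT):
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        result = await response.text()
            break
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            attempt += 1
            if attempt == RETRY_LIMIT:
                result = f"Error: Network error after {RETRY_LIMIT} attempts - {e}"
    return result


if __name__ == "__main__":
    print(asyncio.run(fetch_with_retries("https://example.com")))
```

As in the diff, the final failed attempt stores an "Error: ..." string rather than raising, so callers always receive a string.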
@@ -66,17 +71,29 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.
-
        """
        import undetected_chromedriver as uc

        logger.info(f"Starting scraping with {self.backend}...")
        results = ""
-         try:
-             driver = uc.Chrome(headless=self.headless)
-             results = driver.get(url).page_content
-         except Exception as e:
-             results = f"Error: {e}"
+         attempt = 0
+
+         while attempt < self.RETRY_LIMIT:
+             try:
+                 async with async_timeout.timeout(self.TIMEOUT):
+                     driver = uc.Chrome(headless=self.headless)
+                     driver.get(url)
+                     results = driver.page_source
+ logger .info (f"Successfully scraped { url } " )
88
+ break
89
+ except (aiohttp .ClientError , asyncio .TimeoutError ) as e :
90
+ attempt += 1
91
+ logger .error (f"Attempt { attempt } failed: { e } " )
92
+ if attempt == self .RETRY_LIMIT :
93
+ results = f"Error: Network error after { self .RETRY_LIMIT } attempts - { e } "
94
+ finally :
95
+ driver .quit ()
96
+
80
97
return results
81
98
82
99
async def ascrape_playwright (self , url : str ) -> str :
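In the chromedriver branch above, the driver is created inside the try block but released in the finally clause; the sketch below shows the same retry loop with the reference pre-initialised, so quit() is only called when Chrome actually started. It is a standalone, synchronous illustration (Selenium calls are blocking): the constants and headless flag are assumed to mirror the class attributes, and set_page_load_timeout stands in for the async_timeout guard.

```python
# Defensive variant of the chromedriver retry loop: guard the driver reference
# before quitting. Standalone sketch; constants mirror the class attributes above.
import undetected_chromedriver as uc

RETRY_LIMIT = 3
TIMEOUT = 10


def scrape_with_chromedriver(url: str, headless: bool = True) -> str:
    results = ""
    attempt = 0
    while attempt < RETRY_LIMIT:
        driver = None
        try:
            driver = uc.Chrome(headless=headless)
            driver.set_page_load_timeout(TIMEOUT)  # Selenium-level page-load timeout
            driver.get(url)
            results = driver.page_source  # Selenium exposes the HTML as page_source
            break
        except Exception as e:  # WebDriverException, TimeoutException, ...
            attempt += 1
            if attempt == RETRY_LIMIT:
                results = f"Error: Network error after {RETRY_LIMIT} attempts - {e}"
        finally:
            if driver is not None:
                driver.quit()
    return results
```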
@@ -88,28 +105,36 @@ async def ascrape_playwright(self, url: str) -> str:

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.
-
        """
        from playwright.async_api import async_playwright
        from undetected_playwright import Malenia

        logger.info(f"Starting scraping with {self.backend}...")
        results = ""
-         async with async_playwright() as p:
-             browser = await p.chromium.launch(
-                 headless=self.headless, proxy=self.proxy, **self.browser_config
-             )
+         attempt = 0
+
+         while attempt < self.RETRY_LIMIT:
            try:
-                 context = await browser.new_context()
-                 await Malenia.apply_stealth(context)
-                 page = await context.new_page()
-                 await page.goto(url, wait_until="domcontentloaded")
-                 await page.wait_for_load_state(self.load_state)
-                 results = await page.content()  # Simply get the HTML content
-                 logger.info("Content scraped")
-             except Exception as e:
-                 results = f"Error: {e}"
-             await browser.close()
+                 async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
+                     browser = await p.chromium.launch(
+                         headless=self.headless, proxy=self.proxy, **self.browser_config
+                     )
+                     context = await browser.new_context()
+                     await Malenia.apply_stealth(context)
+                     page = await context.new_page()
+                     await page.goto(url, wait_until="domcontentloaded")
+                     await page.wait_for_load_state(self.load_state)
+                     results = await page.content()
+                     logger.info("Content scraped")
+                     break
+             except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                 attempt += 1
+                 logger.error(f"Attempt {attempt} failed: {e}")
+                 if attempt == self.RETRY_LIMIT:
+                     results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+             finally:
+                 await browser.close()
+
        return results

    def lazy_load(self) -> Iterator[Document]:
@@ -121,7 +146,6 @@ def lazy_load(self) -> Iterator[Document]:

        Yields:
            Document: The scraped content encapsulated within a Document object.
-
        """
        scraping_fn = getattr(self, f"ascrape_{self.backend}")
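With the change applied, the retry behaviour is transparent to callers of the loader. A minimal usage sketch follows; the import path, the example URL, and the assumption that the remaining constructor arguments have sensible defaults are illustrative rather than taken from the diff.

```python
# Usage sketch: drive the retrying playwright scraper directly.
# Module path and default constructor arguments are assumptions for illustration.
import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader  # assumed import path

loader = ChromiumLoader(urls=["https://example.com"])

# ascrape_playwright is a coroutine, so it can be driven with asyncio.run();
# on repeated network/timeout failures it returns an "Error: ..." string instead of raising.
html = asyncio.run(loader.ascrape_playwright("https://example.com"))
print(html[:200])
```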