@@ -22,9 +22,26 @@ This demo showcases how to use Trigger.dev with Python to build a web crawler th
- Our [Python build extension](/config/extensions/pythonExtension) to install the dependencies and run the Python script (see the sketch below)
- [Crawl4AI](https://github.com/unclecode/crawl4ai), an open source, LLM-friendly web crawler
- A custom [Playwright extension](https://playwright.dev/) to create a headless Chromium browser
+ - Proxy support
+
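The build configuration itself isn't shown here. As a rough sketch only (the `pythonExtension` import path and option names are assumptions based on the Python build extension docs, and the custom Playwright extension from the example repo is omitted), a `trigger.config.ts` might look like this:

```ts trigger.config.ts
import { defineConfig } from "@trigger.dev/sdk/v3";
// Assumed import path for the Python build extension; check /config/extensions/pythonExtension if it differs
import { pythonExtension } from "@trigger.dev/python/extension";

export default defineConfig({
  project: "<your-project-ref>",
  build: {
    extensions: [
      // Installs the Python dependencies and bundles the crawler script with the deployment
      pythonExtension({
        requirementsFile: "./requirements.txt", // assumed: lists crawl4ai
        scripts: ["./src/python/**/*.py"], // assumed glob covering crawl-url.py
      }),
      // The custom Playwright/Chromium extension from the repo would also be added here
    ],
  },
});
```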
+ ## Using Proxies

<ScrapingWarning />

+ Some popular proxy services are:
+
+ - [Smartproxy](https://smartproxy.com/)
+ - [Bright Data](https://brightdata.com/)
+ - [Browserbase](https://browserbase.com/)
+ - [Oxylabs](https://oxylabs.io/)
+ - [ScrapingBee](https://scrapingbee.com/)
+
+ Once you have a proxy service, set the following environment variables in your Trigger.dev `.env` file, and add them in the Trigger.dev dashboard:
+
+ - `PROXY_URL`: The URL of your proxy server (e.g., `http://proxy.example.com:8080`)
+ - `PROXY_USERNAME`: Username for authenticated proxies (optional)
+ - `PROXY_PASSWORD`: Password for authenticated proxies (optional)
+
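For example, a `.env` file might look like the snippet below (the username and password are hypothetical placeholders; use the credentials from your proxy provider):

```bash .env
PROXY_URL=http://proxy.example.com:8080
PROXY_USERNAME=your-proxy-username
PROXY_PASSWORD=your-proxy-password
```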

## GitHub repo

<Card
@@ -113,7 +130,14 @@ export const convertUrlToMarkdown = schemaTask({
    url: z.string().url(),
  }),
  run: async (payload) => {
-    const result = await python.runScript("./src/python/crawl-url.py", [payload.url]);
+    // Pass through any proxy environment variables
+    const env = {
+      PROXY_URL: process.env.PROXY_URL,
+      PROXY_USERNAME: process.env.PROXY_USERNAME,
+      PROXY_PASSWORD: process.env.PROXY_PASSWORD,
+    };
+
+    const result = await python.runScript("./src/python/crawl-url.py", [payload.url], { env });

    logger.debug("convert-url-to-markdown", {
      url: payload.url,
@@ -142,10 +166,34 @@ The Python script is a simple script using Crawl4AI that takes a URL and returns
```python src/python/crawl-url.py
import asyncio
import sys
+ import os
from crawl4ai import *
+ from crawl4ai.async_configs import BrowserConfig

async def main(url: str):
-     async with AsyncWebCrawler() as crawler:
+     # Get proxy configuration from environment variables
+     proxy_url = os.environ.get("PROXY_URL")
+     proxy_username = os.environ.get("PROXY_USERNAME")
+     proxy_password = os.environ.get("PROXY_PASSWORD")
+
+     # Configure the proxy
+     browser_config = None
+     if proxy_url:
+         if proxy_username and proxy_password:
+             # Use authenticated proxy
+             proxy_config = {
+                 "server": proxy_url,
+                 "username": proxy_username,
+                 "password": proxy_password
+             }
+             browser_config = BrowserConfig(proxy_config=proxy_config)
+         else:
+             # Use simple proxy
+             browser_config = BrowserConfig(proxy=proxy_url)
+     else:
+         browser_config = BrowserConfig()
+
+     async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url=url,
        )