Skip to content

Commit a1ea09d

Browse files
committed
Added Python crawling (slithering?) example
1 parent 67115ca commit a1ea09d

File tree

5 files changed

+198
-50
lines changed

5 files changed

+198
-50
lines changed

docs/docs.json

Lines changed: 13 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,14 @@
1818
"groups": [
1919
{
2020
"group": "Getting started",
21-
"pages": [
22-
"introduction",
23-
"quick-start",
24-
"video-walkthrough",
25-
"how-it-works",
26-
"limits"
27-
]
21+
"pages": ["introduction", "quick-start", "video-walkthrough", "how-it-works", "limits"]
2822
},
2923
{
3024
"group": "Fundamentals",
3125
"pages": [
3226
{
3327
"group": "Tasks",
34-
"pages": [
35-
"tasks/overview",
36-
"tasks/schemaTask",
37-
"tasks/scheduled"
38-
]
28+
"pages": ["tasks/overview", "tasks/schemaTask", "tasks/scheduled"]
3929
},
4030
"triggering",
4131
"runs",
@@ -50,13 +40,7 @@
5040
"errors-retrying",
5141
{
5242
"group": "Wait",
53-
"pages": [
54-
"wait",
55-
"wait-for",
56-
"wait-until",
57-
"wait-for-event",
58-
"wait-for-request"
59-
]
43+
"pages": ["wait", "wait-for", "wait-until", "wait-for-event", "wait-for-request"]
6044
},
6145
"queue-concurrency",
6246
"versioning",
@@ -100,9 +84,7 @@
10084
},
10185
{
10286
"group": "Development",
103-
"pages": [
104-
"cli-dev"
105-
]
87+
"pages": ["cli-dev"]
10688
},
10789
{
10890
"group": "Deployment",
@@ -113,9 +95,7 @@
11395
"deployment/atomic-deployment",
11496
{
11597
"group": "Deployment integrations",
116-
"pages": [
117-
"vercel-integration"
118-
]
98+
"pages": ["vercel-integration"]
11999
}
120100
]
121101
},
@@ -166,12 +146,7 @@
166146
},
167147
{
168148
"group": "Using the Dashboard",
169-
"pages": [
170-
"run-tests",
171-
"troubleshooting-alerts",
172-
"replaying",
173-
"bulk-actions"
174-
]
149+
"pages": ["run-tests", "troubleshooting-alerts", "replaying", "bulk-actions"]
175150
},
176151
{
177152
"group": "Troubleshooting",
@@ -197,11 +172,7 @@
197172
},
198173
{
199174
"group": "Help",
200-
"pages": [
201-
"community",
202-
"help-slack",
203-
"help-email"
204-
]
175+
"pages": ["community", "help-slack", "help-email"]
205176
}
206177
]
207178
},
@@ -222,10 +193,7 @@
222193
},
223194
{
224195
"group": "Tasks API",
225-
"pages": [
226-
"management/tasks/trigger",
227-
"management/tasks/batch-trigger"
228-
]
196+
"pages": ["management/tasks/trigger", "management/tasks/batch-trigger"]
229197
},
230198
{
231199
"group": "Runs API",
@@ -271,9 +239,7 @@
271239
"groups": [
272240
{
273241
"group": "Introduction",
274-
"pages": [
275-
"guides/introduction"
276-
]
242+
"pages": ["guides/introduction"]
277243
},
278244
{
279245
"group": "Frameworks",
@@ -340,7 +306,8 @@
340306
"guides/example-projects/claude-thinking-chatbot",
341307
"guides/example-projects/realtime-fal-ai",
342308
"guides/example-projects/realtime-csv-importer",
343-
"guides/example-projects/vercel-ai-sdk-image-generator"
309+
"guides/example-projects/vercel-ai-sdk-image-generator",
310+
"guides/python/python-crawl4ai"
344311
]
345312
},
346313
{
@@ -386,10 +353,7 @@
386353
"href": "https://trigger.dev"
387354
},
388355
"api": {
389-
"openapi": [
390-
"openapi.yml",
391-
"v3-openapi.yaml"
392-
],
356+
"openapi": ["openapi.yml", "v3-openapi.yaml"],
393357
"playground": {
394358
"display": "simple"
395359
}
@@ -564,4 +528,4 @@
564528
"destination": "/management/overview"
565529
}
566530
]
567-
}
531+
}

docs/guides/examples/puppeteer.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ There's no payload required for this task so you can just click "Run test" from
205205

206206
## Proxying
207207

208-
If you're using Trigger.dev Cloud and Puppeteer or any other tool to scrape content from websites you don't own, you'll need to proxy your requests. **If you don't you'll risk getting our IP address blocked and we will ban you from our service.**
208+
If you're using Trigger.dev Cloud and Puppeteer or any other tool to scrape content from websites you don't own, you'll need to proxy your requests. **If you don't, you'll risk getting our IP address blocked and we will ban you from our service. You must always have permission from the website owner to scrape their content.**
209209

210210
Here is a list of proxy services we recommend:
211211

docs/guides/introduction.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ Example projects are full projects with example repos you can fork and use. Thes
4545
| [Realtime Fal.ai image generation](/guides/example-projects/realtime-fal-ai) | Generate an image from a prompt using Fal.ai and show the progress of the task on the frontend using Realtime. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/realtime-fal-ai-image-generation) |
4646
| [Realtime CSV Importer](/guides/example-projects/realtime-csv-importer) | Upload a CSV file and see the progress of the task streamed to the frontend. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/realtime-csv-importer) |
4747
| [Vercel AI SDK image generator](/guides/example-projects/vercel-ai-sdk-image-generator) | Use the Vercel AI SDK to generate images from a prompt. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/vercel-ai-sdk-image-generator) |
48+
| [Python web crawler](/guides/python/python-crawl4ai) | Use Python, Crawl4AI and Playwright to create a headless web crawler with Trigger.dev. | — | [View the repo](https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai) |
4849

4950
## Example tasks
5051

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
---
2+
title: "Python headless browser web crawler example"
3+
sidebarTitle: "Python headless web crawler"
4+
description: "Learn how to use Python, Crawl4AI and Playwright to create a headless browser web crawler with Trigger.dev."
5+
---
6+
7+
import ScrapingWarning from "/snippets/web-scraping-warning.mdx";
8+
import PythonLearnMore from "/snippets/python-learn-more.mdx";
9+
10+
## Prerequisites
11+
12+
- A project with [Trigger.dev initialized](/quick-start)
13+
- [Python](https://www.python.org/) installed on your local machine
14+
15+
## Overview
16+
17+
This demo showcases how to use Trigger.dev with Python to build a web crawler that uses a headless browser to navigate websites and extract content.
18+
19+
## Features
20+
21+
- [Trigger.dev](https://trigger.dev) for background task orchestration
22+
- Our [Python build extension](/config/extensions/pythonExtension) to install the dependencies and run the Python script
23+
- [Crawl4AI](https://github.com/unclecode/crawl4ai), an open source LLM friendly web crawler
24+
- A custom [Playwright extension](https://playwright.dev/) to create a headless Chromium browser
25+
26+
<ScrapingWarning />
27+
28+
## GitHub repo
29+
30+
<Card
31+
title="View the project on GitHub"
32+
icon="GitHub"
33+
href="https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai"
34+
>
35+
Click here to view the full code for this project in our examples repository on GitHub. You can
36+
fork it and use it as a starting point for your own project.
37+
</Card>
38+
39+
## The code
40+
41+
### Build configuration
42+
43+
After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file:
44+
45+
```ts trigger.config.ts
46+
import { defineConfig } from "@trigger.dev/sdk/v3";
47+
import { pythonExtension } from "@trigger.dev/python/extension";
48+
import type { BuildContext, BuildExtension } from "@trigger.dev/core/v3/build";
49+
50+
export default defineConfig({
51+
project: "<project ref>",
52+
// Your other config settings...
53+
build: {
54+
extensions: [
55+
// This is required to use the Python extension
56+
pythonExtension(),
57+
// This is required to create a headless chromium browser with Playwright
58+
installPlaywrightChromium(),
59+
],
60+
},
61+
});
62+
63+
// This is a custom build extension to install Playwright and Chromium
64+
export function installPlaywrightChromium(): BuildExtension {
65+
return {
66+
name: "InstallPlaywrightChromium",
67+
onBuildComplete(context: BuildContext) {
68+
const instructions = [
69+
// Base and Chromium dependencies
70+
`RUN apt-get update && apt-get install -y --no-install-recommends \
71+
curl unzip npm libnspr4 libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 \
72+
libasound2 libnss3 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
73+
libgbm1 libxkbcommon0 \
74+
&& apt-get clean && rm -rf /var/lib/apt/lists/*`,
75+
76+
// Install Playwright and Chromium
77+
`RUN npm install -g playwright`,
78+
`RUN mkdir -p /ms-playwright`,
79+
`RUN PLAYWRIGHT_BROWSERS_PATH=/ms-playwright python -m playwright install --with-deps chromium`,
80+
];
81+
82+
context.addLayer({
83+
id: "playwright",
84+
image: { instructions },
85+
deploy: {
86+
env: {
87+
PLAYWRIGHT_BROWSERS_PATH: "/ms-playwright",
88+
PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1",
89+
PLAYWRIGHT_SKIP_BROWSER_VALIDATION: "1",
90+
},
91+
override: true,
92+
},
93+
});
94+
},
95+
};
96+
}
97+
```
98+
99+
Learn more about the [trigger.config.ts](/config/config-file) file including setting default retry settings, customizing the build environment, and more.
100+
101+
### Task code
102+
103+
This task uses the `python.runScript` method to run the `crawl-url.py` script with the given URL as an argument. You can see the original task in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/trigger/pythonTasks.ts).
104+
105+
```ts src/trigger/pythonTasks.ts
106+
import { logger, schemaTask, task } from "@trigger.dev/sdk/v3";
107+
import { python } from "@trigger.dev/python";
108+
import { z } from "zod";
109+
110+
export const convertUrlToMarkdown = schemaTask({
111+
id: "convert-url-to-markdown",
112+
schema: z.object({
113+
url: z.string().url(),
114+
}),
115+
run: async (payload) => {
116+
const result = await python.runScript("./src/python/crawl-url.py", [payload.url]);
117+
118+
logger.debug("convert-url-to-markdown", {
119+
url: payload.url,
120+
result,
121+
});
122+
123+
return result.stdout;
124+
},
125+
});
126+
```
127+
128+
### Add a requirements.txt file
129+
130+
Add the following to your `requirements.txt` file. This is required in Python projects to install the dependencies.
131+
132+
```txt requirements.txt
133+
crawl4ai
134+
playwright
135+
urllib3<2.0.0
136+
```
137+
138+
### The Python script
139+
140+
The Python script is a simple script using Crawl4AI that takes a URL and returns the markdown content of the page. You can see the original script in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/python/crawl-url.py).
141+
142+
```python src/python/crawl-url.py
143+
import asyncio
144+
import sys
145+
from crawl4ai import *
146+
147+
async def main(url: str):
148+
async with AsyncWebCrawler() as crawler:
149+
result = await crawler.arun(
150+
url=url,
151+
)
152+
print(result.markdown)
153+
154+
if __name__ == "__main__":
155+
if len(sys.argv) < 2:
156+
print("Usage: python crawl-url.py <url>")
157+
sys.exit(1)
158+
url = sys.argv[1]
159+
asyncio.run(main(url))
160+
```
161+
162+
## Testing your task
163+
164+
1. Create a virtual environment `python -m venv venv`
165+
2. Activate the virtual environment, depending on your OS: On Mac/Linux: `source venv/bin/activate`, on Windows: `venv\Scripts\activate`
166+
3. Install the Python dependencies `pip install -r requirements.txt`
167+
4. If you haven't already, copy your project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file.
168+
5. Run the Trigger.dev dev CLI command with `npx trigger.dev@latest dev` (it may ask you to authorize the CLI if you haven't already).
169+
6. Test the task in the dashboard, using a URL of your choice.
170+
171+
<ScrapingWarning />
172+
173+
## Deploying your task
174+
175+
Deploy the task to production using the CLI command `npx trigger.dev@latest deploy`
176+
177+
<PythonLearnMore />

docs/snippets/python-learn-more.mdx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
## Learn more about using Python with Trigger.dev
2+
3+
<Card title="Python build extension" icon="code" href="/config/extensions/pythonExtension">
4+
Learn how to use our built-in Python build extension to install dependencies and run your Python
5+
code.
6+
</Card>

0 commit comments

Comments
 (0)