---
title: "Python headless browser web crawler example"
sidebarTitle: "Python headless web crawler"
description: "Learn how to use Python, Crawl4AI and Playwright to create a headless browser web crawler with Trigger.dev."
---

import ScrapingWarning from "/snippets/web-scraping-warning.mdx";
import PythonLearnMore from "/snippets/python-learn-more.mdx";

## Prerequisites

- A project with [Trigger.dev initialized](/quick-start)
- [Python](https://www.python.org/) installed on your local machine

## Overview

This demo showcases how to use Trigger.dev with Python to build a web crawler that uses a headless browser to navigate websites and extract content.

## Features

- [Trigger.dev](https://trigger.dev) for background task orchestration
- Our [Python build extension](/config/extensions/pythonExtension) to install the dependencies and run the Python script
- [Crawl4AI](https://github.com/unclecode/crawl4ai), an open-source, LLM-friendly web crawler
- A custom [Playwright](https://playwright.dev/) build extension to create a headless Chromium browser

<ScrapingWarning />

## GitHub repo

<Card
  title="View the project on GitHub"
  icon="GitHub"
  href="https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai"
>
  Click here to view the full code for this project in our examples repository on GitHub. You can
  fork it and use it as a starting point for your own project.
</Card>

## The code

### Build configuration

After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file:

```ts trigger.config.ts
import { defineConfig } from "@trigger.dev/sdk/v3";
import { pythonExtension } from "@trigger.dev/python/extension";
import type { BuildContext, BuildExtension } from "@trigger.dev/core/v3/build";

export default defineConfig({
  project: "<project ref>",
  // Your other config settings...
  build: {
    extensions: [
      // This is required to use the Python extension
      pythonExtension(),
      // This is required to create a headless Chromium browser with Playwright
      installPlaywrightChromium(),
    ],
  },
});

// This is a custom build extension to install Playwright and Chromium
export function installPlaywrightChromium(): BuildExtension {
  return {
    name: "InstallPlaywrightChromium",
    onBuildComplete(context: BuildContext) {
      const instructions = [
        // Base and Chromium dependencies
        `RUN apt-get update && apt-get install -y --no-install-recommends \
        curl unzip npm libnspr4 libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 \
        libasound2 libnss3 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
        libgbm1 libxkbcommon0 \
        && apt-get clean && rm -rf /var/lib/apt/lists/*`,

        // Install Playwright and Chromium
        `RUN npm install -g playwright`,
        `RUN mkdir -p /ms-playwright`,
        `RUN PLAYWRIGHT_BROWSERS_PATH=/ms-playwright python -m playwright install --with-deps chromium`,
      ];

      context.addLayer({
        id: "playwright",
        image: { instructions },
        deploy: {
          env: {
            PLAYWRIGHT_BROWSERS_PATH: "/ms-playwright",
            PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1",
            PLAYWRIGHT_SKIP_BROWSER_VALIDATION: "1",
          },
          override: true,
        },
      });
    },
  };
}
```

Learn more about the [trigger.config.ts](/config/config-file) file, including setting default retry settings, customizing the build environment, and more.
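
For instance, here's a minimal sketch of what project-wide default retry settings could look like in that same file (the values are illustrative, not recommendations):

```ts trigger.config.ts
import { defineConfig } from "@trigger.dev/sdk/v3";

export default defineConfig({
  project: "<project ref>",
  // Default retry behavior applied to every task in this project
  retries: {
    enabledInDev: false, // don't retry runs while using the dev CLI
    default: {
      maxAttempts: 3, // retry failed attempts up to 3 times
      minTimeoutInMs: 1000, // wait ~1s before the first retry
      maxTimeoutInMs: 10000, // cap the backoff at 10s
      factor: 2, // double the wait between attempts
      randomize: true, // add jitter to the backoff
    },
  },
  // ...plus the build extensions shown above
});
```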

### Task code

This task uses the `python.runScript` method to run the `crawl-url.py` script with the given URL as an argument. You can see the original task in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/trigger/pythonTasks.ts).

```ts src/trigger/pythonTasks.ts
import { logger, schemaTask } from "@trigger.dev/sdk/v3";
import { python } from "@trigger.dev/python";
import { z } from "zod";

export const convertUrlToMarkdown = schemaTask({
  id: "convert-url-to-markdown",
  schema: z.object({
    url: z.string().url(),
  }),
  run: async (payload) => {
    // Run the Python script, passing the URL as a command-line argument
    const result = await python.runScript("./src/python/crawl-url.py", [payload.url]);

    logger.debug("convert-url-to-markdown", {
      url: payload.url,
      result,
    });

    // The markdown printed by the script is captured on stdout
    return result.stdout;
  },
});
```
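
For reference, here's a minimal sketch of triggering this task from elsewhere in your backend code (the payload URL is just an example):

```ts
import { tasks } from "@trigger.dev/sdk/v3";
import type { convertUrlToMarkdown } from "./trigger/pythonTasks";

// Trigger the task and return immediately with a handle to the run
const handle = await tasks.trigger<typeof convertUrlToMarkdown>("convert-url-to-markdown", {
  url: "https://example.com", // any URL that satisfies the task's schema
});

console.log("Run started:", handle.id);
```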

### Add a requirements.txt file

Add the following to your `requirements.txt` file. Python projects use this file to declare the dependencies to install:

```txt requirements.txt
crawl4ai
playwright
urllib3<2.0.0
```

### The Python script

The Python script uses Crawl4AI to crawl the given URL with a headless browser and print the page content as markdown. You can see the original script in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/python/crawl-url.py).

```python src/python/crawl-url.py
import asyncio
import sys

from crawl4ai import AsyncWebCrawler

async def main(url: str):
    # Launch the headless browser and crawl the given URL
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)
        # Print the markdown so the Trigger.dev task can capture it from stdout
        print(result.markdown)

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python crawl-url.py <url>")
        sys.exit(1)
    url = sys.argv[1]
    asyncio.run(main(url))
```
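
You can also run the script directly while developing, e.g. `python src/python/crawl-url.py https://example.com` (an example URL), to check the markdown output without triggering the task.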

## Testing your task

1. Create a virtual environment: `python -m venv venv`
2. Activate the virtual environment, depending on your OS. On Mac/Linux: `source venv/bin/activate`; on Windows: `venv\Scripts\activate`
3. Install the Python dependencies: `pip install -r requirements.txt`
4. If you haven't already, copy your project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file.
5. Run the Trigger.dev dev CLI command with `npx trigger.dev@latest dev` (it may ask you to authorize the CLI if you haven't already).
6. Test the task in the dashboard, using a URL of your choice, e.g. `{ "url": "https://example.com" }`.

<ScrapingWarning />

## Deploying your task

Deploy the task to production using the CLI command `npx trigger.dev@latest deploy`.

<PythonLearnMore />