---
title: "Python headless browser web crawler example"
sidebarTitle: "Python headless web crawler"
description: "Learn how to use Python, Crawl4AI and Playwright to create a headless browser web crawler with Trigger.dev."
---

import ScrapingWarning from "/snippets/web-scraping-warning.mdx";
import PythonLearnMore from "/snippets/python-learn-more.mdx";

## Prerequisites

- A project with [Trigger.dev initialized](/quick-start)
- [Python](https://www.python.org/) installed on your local machine

## Overview

This demo showcases how to use Trigger.dev with Python to build a web crawler that uses a headless browser to navigate websites and extract content.

## Features

- [Trigger.dev](https://trigger.dev) for background task orchestration
- Our [Python build extension](/config/extensions/pythonExtension) to install the dependencies and run the Python script
- [Crawl4AI](https://github.com/unclecode/crawl4ai), an open-source, LLM-friendly web crawler
- A custom [Playwright](https://playwright.dev/) build extension to create a headless Chromium browser

<ScrapingWarning />

## GitHub repo

<Card
  title="View the project on GitHub"
  icon="GitHub"
  href="https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai"
>
  Click here to view the full code for this project in our examples repository on GitHub. You can
  fork it and use it as a starting point for your own project.
</Card>

## The code

### Build configuration

After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file:

```ts trigger.config.ts
import { defineConfig } from "@trigger.dev/sdk/v3";
import { pythonExtension } from "@trigger.dev/python/extension";
import type { BuildContext, BuildExtension } from "@trigger.dev/core/v3/build";

export default defineConfig({
  project: "<project ref>",
  // Your other config settings...
  build: {
    extensions: [
      // This is required to use the Python extension
      pythonExtension(),
      // This is required to create a headless Chromium browser with Playwright
      installPlaywrightChromium(),
    ],
  },
});

// This is a custom build extension to install Playwright and Chromium
export function installPlaywrightChromium(): BuildExtension {
  return {
    name: "InstallPlaywrightChromium",
    onBuildComplete(context: BuildContext) {
      const instructions = [
        // Base and Chromium dependencies
        `RUN apt-get update && apt-get install -y --no-install-recommends \
        curl unzip npm libnspr4 libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 \
        libasound2 libnss3 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
        libgbm1 libxkbcommon0 \
        && apt-get clean && rm -rf /var/lib/apt/lists/*`,

        // Install Playwright and Chromium
        `RUN npm install -g playwright`,
        `RUN mkdir -p /ms-playwright`,
        `RUN PLAYWRIGHT_BROWSERS_PATH=/ms-playwright python -m playwright install --with-deps chromium`,
      ];

      context.addLayer({
        id: "playwright",
        image: { instructions },
        deploy: {
          env: {
            PLAYWRIGHT_BROWSERS_PATH: "/ms-playwright",
            PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1",
            PLAYWRIGHT_SKIP_BROWSER_VALIDATION: "1",
          },
          override: true,
        },
      });
    },
  };
}
```

Learn more about the [trigger.config.ts](/config/config-file) file, including setting default retry settings, customizing the build environment, and more.
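
For instance, here's a minimal sketch of what project-wide default retry settings could look like in that same file (the values are illustrative, not recommendations):

```ts trigger.config.ts
import { defineConfig } from "@trigger.dev/sdk/v3";

export default defineConfig({
  project: "<project ref>",
  // Default retry behavior applied to every task in this project
  retries: {
    enabledInDev: false, // don't retry runs while using the dev CLI
    default: {
      maxAttempts: 3, // retry failed attempts up to 3 times
      minTimeoutInMs: 1000, // wait ~1s before the first retry
      maxTimeoutInMs: 10000, // cap the backoff at 10s
      factor: 2, // double the wait between attempts
      randomize: true, // add jitter to the backoff
    },
  },
  // ...plus the build extensions shown above
});
```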

### Task code

This task uses the `python.runScript` method to run the `crawl-url.py` script with the given URL as an argument. You can see the original task in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/trigger/pythonTasks.ts).

```ts src/trigger/pythonTasks.ts
import { logger, schemaTask } from "@trigger.dev/sdk/v3";
import { python } from "@trigger.dev/python";
import { z } from "zod";

export const convertUrlToMarkdown = schemaTask({
  id: "convert-url-to-markdown",
  schema: z.object({
    url: z.string().url(),
  }),
  run: async (payload) => {
    // Run the Python script, passing the URL as a command-line argument
    const result = await python.runScript("./src/python/crawl-url.py", [payload.url]);

    logger.debug("convert-url-to-markdown", {
      url: payload.url,
      result,
    });

    // The markdown printed by the script is captured on stdout
    return result.stdout;
  },
});
```
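
For reference, here's a minimal sketch of triggering this task from elsewhere in your backend code (the payload URL is just an example):

```ts
import { tasks } from "@trigger.dev/sdk/v3";
import type { convertUrlToMarkdown } from "./trigger/pythonTasks";

// Trigger the task and return immediately with a handle to the run
const handle = await tasks.trigger<typeof convertUrlToMarkdown>("convert-url-to-markdown", {
  url: "https://example.com", // any URL that satisfies the task's schema
});

console.log("Run started:", handle.id);
```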

### Add a requirements.txt file

Add the following to your `requirements.txt` file. Python projects use this file to declare the dependencies to install:

```txt requirements.txt
crawl4ai
playwright
urllib3<2.0.0
```

### The Python script

The Python script uses Crawl4AI to crawl the given URL with a headless browser and print the page content as markdown. You can see the original script in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/python/crawl-url.py).

```python src/python/crawl-url.py
import asyncio
import sys

from crawl4ai import AsyncWebCrawler

async def main(url: str):
    # Launch the headless browser and crawl the given URL
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)
        # Print the markdown so the Trigger.dev task can capture it from stdout
        print(result.markdown)

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python crawl-url.py <url>")
        sys.exit(1)
    url = sys.argv[1]
    asyncio.run(main(url))
```
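
You can also run the script directly while developing, e.g. `python src/python/crawl-url.py https://example.com` (an example URL), to check the markdown output without triggering the task.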

## Testing your task

1. Create a virtual environment: `python -m venv venv`
2. Activate the virtual environment, depending on your OS. On Mac/Linux: `source venv/bin/activate`; on Windows: `venv\Scripts\activate`
3. Install the Python dependencies: `pip install -r requirements.txt`
4. If you haven't already, copy your project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file.
5. Run the Trigger.dev dev CLI command with `npx trigger.dev@latest dev` (it may ask you to authorize the CLI if you haven't already).
6. Test the task in the dashboard, using a URL of your choice, e.g. `{ "url": "https://example.com" }`.

<ScrapingWarning />

## Deploying your task

Deploy the task to production using the CLI command `npx trigger.dev@latest deploy`.

<PythonLearnMore />