| title | Start SmartCrawler |
|---|---|
| api | POST /v1/crawl |
| description | Start a new web crawl request with AI extraction or markdown conversion |
POST /v1/crawl
Start a new crawl job using SmartCrawler. Choose between AI-powered extraction or cost-effective markdown conversion.
Content-Type: application/json
{
"url": "string",
"prompt": "string",
"extraction_mode": "boolean",
"cache_website": "boolean",
"depth": "integer",
"breadth": "integer",
"max_pages": "integer",
"same_domain_only": "boolean",
"batch_size": "integer",
"schema": { /* JSON Schema object */ },
"rules": {
"exclude": ["string"],
"include_paths": ["string"],
"exclude_paths": ["string"],
"same_domain": "boolean"
},
"sitemap": "boolean",
"stealth": "boolean",
"webhook_url": "string",
"wait_ms": "integer"
}

| Parameter | Type | Required | Default | Description |
|---|---|---|---|---|
| url | string | Yes | - | The starting URL for the crawl |
| prompt | string | No* | - | Instructions for data extraction (*required when extraction_mode=true) |
| extraction_mode | boolean | No | true | When false, enables markdown conversion mode (NO AI/LLM processing, 2 credits per page) |
| cache_website | boolean | No | false | Whether to cache the website content |
| depth | integer | No | 1 | Maximum crawl depth |
| breadth | integer | No | null | Maximum number of links to crawl per depth level. If null/undefined, unlimited (default). Controls the 'width' of exploration at each depth. Useful for limiting crawl scope on large sites. Note: max_pages always takes priority. Ignored when sitemap=true. |
| max_pages | integer | No | 10 | Maximum number of pages to crawl |
| same_domain_only | boolean | No | true | Whether to crawl only the same domain |
| batch_size | integer | No | 1 | Number of pages to process in each batch |
| schema | object | No | - | JSON Schema object for structured output |
| rules | object | No | - | Crawl rules for filtering URLs. Object with optional fields: exclude (array of regex URL patterns), include_paths (array of path patterns to include, supports wildcards * and **), exclude_paths (array of path patterns to exclude, takes precedence over include_paths), same_domain (boolean, default: true). See Rules section below for details. |
| sitemap | boolean | No | false | Use sitemap.xml for discovery |
| stealth | boolean | No | false | Enable stealth mode to bypass bot protection using advanced anti-detection techniques. Adds +4 credits to the request cost |
| webhook_url | string | No | None | Webhook URL to send the job result to. When provided, a signed webhook notification will be sent upon job completion. See Webhook Signature Verification below. |
| wait_ms | integer | No | 3000 | Milliseconds to wait before scraping each page. Useful for pages with heavy JavaScript rendering that need extra time to load. |
{
"url": "https://scrapegraphai.com/",
"prompt": "What does the company do? and I need text content from there privacy and terms",
"cache_website": true,
"depth": 2,
"breadth": null,
"max_pages": 2,
"same_domain_only": true,
"batch_size": 1,
"stealth": true,
"rules": {
"include_paths": ["/privacy", "/terms", "/about"],
"exclude_paths": ["/admin/*", "/api/**"],
"same_domain": true
},
"webhook_url":"https://api.example.com/handle/webhook"
"schema": {
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "ScrapeGraphAI Website Content",
"type": "object",
"properties": {
"company": {
"type": "object",
"properties": {
"name": { "type": "string" },
"description": { "type": "string" },
"features": {
"type": "array",
"items": { "type": "string" }
},
"contact_email": { "type": "string", "format": "email" },
"social_links": {
"type": "object",
"properties": {
"github": { "type": "string", "format": "uri" },
"linkedin": { "type": "string", "format": "uri" },
"twitter": { "type": "string", "format": "uri" }
},
"additionalProperties": false
}
},
"required": ["name", "description"]
},
"services": {
"type": "array",
"items": {
"type": "object",
"properties": {
"service_name": { "type": "string" },
"description": { "type": "string" },
"features": {
"type": "array",
"items": { "type": "string" }
}
},
"required": ["service_name", "description"]
}
},
"legal": {
"type": "object",
"properties": {
"privacy_policy": { "type": "string" },
"terms_of_service": { "type": "string" }
},
"required": ["privacy_policy", "terms_of_service"]
}
},
"required": ["company", "services", "legal"]
}
}

For cost-effective HTML to markdown conversion without AI processing:
{
"url": "https://scrapegraphai.com/",
"extraction_mode": false,
"depth": 2,
"breadth": null,
"max_pages": 5,
"same_domain_only": true,
"stealth": true,
"rules": {
"include_paths": ["/docs/*", "/blog/**"],
"exclude_paths": ["/admin/*", "/api/**"],
"same_domain": true
},
"webhook_url":"https://api.example.com/handle/webhook"
}

Control which URLs to crawl using the rules object:
{
"url": "https://example.com/",
"prompt": "Extract product information",
"depth": 3,
"breadth": null,
"max_pages": 50,
"rules": {
"exclude": ["https://example.com/logout", "https://example.com/admin/*"],
"include_paths": ["/products/*", "/blog/**"],
"exclude_paths": ["/admin/*", "/api/**", "/private/*"],
"same_domain": true
}
}

| Field | Type | Default | Description |
|---|---|---|---|
| exclude | array | [] | List of URL patterns (regex) to exclude from crawling. Matches full URL. |
| include_paths | array | [] | (Optional) List of path patterns to include (e.g., ["/products/*", "/blog/**"]). Supports wildcards: * matches any characters, ** matches any path segments. If empty or not specified, all paths are included. |
| exclude_paths | array | [] | (Optional) List of path patterns to exclude (e.g., ["/admin/*", "/api/**"]). Supports wildcards: * matches any characters, ** matches any path segments. Takes precedence over include_paths. If empty or not specified, no paths are excluded. |
| same_domain | boolean | true | Restrict crawling to same domain |
- 200 OK: Crawl started successfully. Returns `{ "task_id": "<task_id>" }`. Use this `task_id` to retrieve the crawl result from the Get Crawl Result endpoint.
- 422 Unprocessable Entity: Validation error.
See the Get Crawl Result endpoint for the full response structure.
When you provide a webhook_url, SmartCrawler will send a POST request to your endpoint upon job completion. Each webhook request includes a signature header for verification.
| Header | Description |
|---|---|
| Content-Type | application/json |
| X-Webhook-Signature | Signature for verifying the webhook authenticity |
The X-Webhook-Signature header contains your webhook secret in the format:
sha256={your_webhook_secret}
To verify that a webhook request is authentic:
- Retrieve your webhook secret from the dashboard
- Compare the `X-Webhook-Signature` header value with `sha256={your_secret}`
from flask import Flask, request, abort
app = Flask(__name__)
WEBHOOK_SECRET = "your_webhook_secret"
@app.route("/webhook", methods=["POST"])
def handle_webhook():
signature = request.headers.get("X-Webhook-Signature")
expected_signature = f"sha256={WEBHOOK_SECRET}"
if signature != expected_signature:
abort(401, "Invalid signature")
# Process the webhook payload
data = request.json
print(f"Received webhook for job: {data['request_id']}")
return {"status": "ok"}const express = require('express');
const app = express();
const WEBHOOK_SECRET = 'your_webhook_secret';
app.post('/webhook', express.json(), (req, res) => {
const signature = req.headers['x-webhook-signature'];
const expectedSignature = `sha256=${WEBHOOK_SECRET}`;
if (signature !== expectedSignature) {
return res.status(401).json({ error: 'Invalid signature' });
}
// Process the webhook payload
console.log(`Received webhook for job: ${req.body.request_id}`);
res.json({ status: 'ok' });
});The webhook POST request contains the following JSON payload:
{
"event": "crawler_completed",
"request_id": "uuid-of-the-crawl-job",
"webhook_id": "uuid-of-this-webhook-delivery",
"status": "success | failed",
"timestamp": "2024-01-28T12:00:00Z",
"duration_seconds": 45.2,
"errors": [],
"result": "..."
}

| Field | Type | Description |
|---|---|---|
| event | string | Always "crawler_completed" |
| request_id | string | The crawl job ID (same as task_id) |
| webhook_id | string | Unique ID for this webhook delivery |
| status | string | "success" or "failed" |
| timestamp | string | ISO 8601 timestamp of completion |
| duration_seconds | float | Total job duration in seconds |
| errors | array | List of error messages (empty if successful) |
| result | string | The crawl result data (null if failed) |