---
title: Start SmartCrawler
api: POST /v1/crawl
description: Start a new web crawl request with AI extraction or markdown conversion
---

# Start Crawl

`POST /v1/crawl`

Start a new crawl job using SmartCrawler. Choose between AI-powered extraction or cost-effective markdown conversion.


## Request Body

`Content-Type: application/json`

### Schema

```json
{
  "url": "string",
  "prompt": "string",
  "extraction_mode": "boolean",
  "cache_website": "boolean",
  "depth": "integer",
  "breadth": "integer",
  "max_pages": "integer",
  "same_domain_only": "boolean",
  "batch_size": "integer",
  "schema": { /* JSON Schema object */ },
  "rules": {
    "exclude": ["string"],
    "include_paths": ["string"],
    "exclude_paths": ["string"],
    "same_domain": "boolean"
  },
  "sitemap": "boolean",
  "stealth": "boolean",
  "webhook_url": "string",
  "wait_ms": "integer"
}
```

## Parameters

| Parameter | Type | Required | Default | Description |
|---|---|---|---|---|
| `url` | string | Yes | - | The starting URL for the crawl |
| `prompt` | string | No* | - | Instructions for data extraction (*required when `extraction_mode=true`) |
| `extraction_mode` | boolean | No | `true` | When `false`, enables markdown conversion mode (no AI/LLM processing, 2 credits per page) |
| `cache_website` | boolean | No | `false` | Whether to cache the website content |
| `depth` | integer | No | 1 | Maximum crawl depth |
| `breadth` | integer | No | `null` | Maximum number of links to crawl per depth level. If `null`/undefined, unlimited (default). Controls the "width" of exploration at each depth, which is useful for limiting crawl scope on large sites. Note: `max_pages` always takes priority. Ignored when `sitemap=true`. |
| `max_pages` | integer | No | 10 | Maximum number of pages to crawl |
| `same_domain_only` | boolean | No | `true` | Whether to crawl only the same domain |
| `batch_size` | integer | No | 1 | Number of pages to process in each batch |
| `schema` | object | No | - | JSON Schema object for structured output |
| `rules` | object | No | - | Crawl rules for filtering URLs. Object with optional fields: `exclude` (array of regex URL patterns), `include_paths` (array of path patterns to include; supports wildcards `*` and `**`), `exclude_paths` (array of path patterns to exclude; takes precedence over `include_paths`), and `same_domain` (boolean, default `true`). See the Rules section below for details. |
| `sitemap` | boolean | No | `false` | Use sitemap.xml for URL discovery |
| `stealth` | boolean | No | `false` | Enable stealth mode to bypass bot protection using advanced anti-detection techniques. Adds +4 credits to the request cost. |
| `webhook_url` | string | No | None | Webhook URL to send the job result to. When provided, a signed webhook notification is sent upon job completion. See Webhook Signature Verification below. |
| `wait_ms` | integer | No | 3000 | Milliseconds to wait before scraping each page. Useful for pages with heavy JavaScript rendering that need extra time to load. |

## Example

```json
{
  "url": "https://scrapegraphai.com/",
  "prompt": "What does the company do? I also need the text content from their privacy and terms pages.",
  "cache_website": true,
  "depth": 2,
  "breadth": null,
  "max_pages": 2,
  "same_domain_only": true,
  "batch_size": 1,
  "stealth": true,
  "rules": {
    "include_paths": ["/privacy", "/terms", "/about"],
    "exclude_paths": ["/admin/*", "/api/**"],
    "same_domain": true
  },
  "webhook_url": "https://api.example.com/handle/webhook",
  "schema": {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "ScrapeGraphAI Website Content",
    "type": "object",
    "properties": {
      "company": {
        "type": "object",
        "properties": {
          "name": { "type": "string" },
          "description": { "type": "string" },
          "features": {
            "type": "array",
            "items": { "type": "string" }
          },
          "contact_email": { "type": "string", "format": "email" },
          "social_links": {
            "type": "object",
            "properties": {
              "github": { "type": "string", "format": "uri" },
              "linkedin": { "type": "string", "format": "uri" },
              "twitter": { "type": "string", "format": "uri" }
            },
            "additionalProperties": false
          }
        },
        "required": ["name", "description"]
      },
      "services": {
        "type": "array",
        "items": {
          "type": "object",
          "properties": {
            "service_name": { "type": "string" },
            "description": { "type": "string" },
            "features": {
              "type": "array",
              "items": { "type": "string" }
            }
          },
          "required": ["service_name", "description"]
        }
      },
      "legal": {
        "type": "object",
        "properties": {
          "privacy_policy": { "type": "string" },
          "terms_of_service": { "type": "string" }
        },
        "required": ["privacy_policy", "terms_of_service"]
      }
    },
    "required": ["company", "services", "legal"]
  }
}
```
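For reference, here is a minimal sketch of submitting this request from Python with the `requests` library. The base URL (`https://api.scrapegraphai.com`) and the `SGAI-APIKEY` auth header are assumptions here; confirm both against your dashboard and the API reference before use.

```python
import requests

API_KEY = "your_api_key"  # from your dashboard

payload = {
    "url": "https://scrapegraphai.com/",
    "prompt": "What does the company do? I also need the text content from their privacy and terms pages.",
    "depth": 2,
    "max_pages": 2,
    "same_domain_only": True,
}

# Assumed base URL and auth header name -- verify against the API reference.
response = requests.post(
    "https://api.scrapegraphai.com/v1/crawl",
    headers={"SGAI-APIKEY": API_KEY, "Content-Type": "application/json"},
    json=payload,
    timeout=30,
)
response.raise_for_status()

# A 200 response returns the task_id used to fetch the crawl result later
task_id = response.json()["task_id"]
print(f"Crawl started, task_id: {task_id}")
```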

## Markdown Conversion Example (No AI/LLM)

For cost-effective HTML-to-markdown conversion without AI processing:

```json
{
  "url": "https://scrapegraphai.com/",
  "extraction_mode": false,
  "depth": 2,
  "breadth": null,
  "max_pages": 5,
  "same_domain_only": true,
  "stealth": true,
  "rules": {
    "include_paths": ["/docs/*", "/blog/**"],
    "exclude_paths": ["/admin/*", "/api/**"],
    "same_domain": true
  },
  "webhook_url": "https://api.example.com/handle/webhook"
}
```

## Rules Example

Control which URLs to crawl using the `rules` object:

```json
{
  "url": "https://example.com/",
  "prompt": "Extract product information",
  "depth": 3,
  "breadth": null,
  "max_pages": 50,
  "rules": {
    "exclude": ["https://example.com/logout", "https://example.com/admin/*"],
    "include_paths": ["/products/*", "/blog/**"],
    "exclude_paths": ["/admin/*", "/api/**", "/private/*"],
    "same_domain": true
  }
}
```

## Rules Parameters

| Field | Type | Default | Description |
|---|---|---|---|
| `exclude` | array | `[]` | List of URL patterns (regex) to exclude from crawling. Matches the full URL. |
| `include_paths` | array | `[]` | Optional list of path patterns to include (e.g., `["/products/*", "/blog/**"]`). Supports wildcards: `*` matches any characters, `**` matches any path segments. If empty or not specified, all paths are included. |
| `exclude_paths` | array | `[]` | Optional list of path patterns to exclude (e.g., `["/admin/*", "/api/**"]`). Supports the same wildcards. Takes precedence over `include_paths`. If empty or not specified, no paths are excluded. |
| `same_domain` | boolean | `true` | Restrict crawling to the same domain |

Both `include_paths` and `exclude_paths` are optional: if `include_paths` is empty or unspecified, all paths are included, and if `exclude_paths` is empty or unspecified, no paths are excluded. The `exclude_paths` patterns take precedence over `include_paths` patterns.

When `extraction_mode: false`, the `prompt` parameter is not required. That mode converts HTML to clean markdown with metadata extraction at only 2 credits per page (80% savings compared to AI mode).
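To make the wildcard semantics concrete, here is a rough sketch of the conventional glob-to-regex translation (`*` staying within one path segment, `**` spanning segments) together with the documented precedence of `exclude_paths` over `include_paths`. This illustrates the usual convention only; the service's actual matcher may differ in edge cases.

```python
import re

def path_pattern_to_regex(pattern: str) -> re.Pattern:
    """Illustrative glob translation: '**' crosses path segments,
    '*' stays within a single segment. Not the service's real matcher."""
    regex = re.escape(pattern)
    regex = regex.replace(r"\*\*", ".*")    # '**' -> any characters, including '/'
    regex = regex.replace(r"\*", "[^/]*")   # '*'  -> any characters except '/'
    return re.compile(f"^{regex}$")

def is_allowed(path: str, include_paths: list, exclude_paths: list) -> bool:
    # exclude_paths takes precedence over include_paths
    if any(path_pattern_to_regex(p).match(path) for p in exclude_paths):
        return False
    if not include_paths:  # empty include list means all paths are included
        return True
    return any(path_pattern_to_regex(p).match(path) for p in include_paths)

assert is_allowed("/blog/2024/post", ["/products/*", "/blog/**"], ["/admin/*"])
assert not is_allowed("/admin/users", ["/blog/**"], ["/admin/*"])
```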

## Response

- **200 OK**: Crawl started successfully. Returns `{ "task_id": "<task_id>" }`. Use this `task_id` to retrieve the crawl result from the Get Crawl Result endpoint.
- **422 Unprocessable Entity**: Validation error.

See the Get Crawl Result endpoint for the full response structure.
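As a convenience, here is a hedged sketch of polling for the result with the returned `task_id`. The result path (`GET /v1/crawl/{task_id}`), base URL, auth header, and status values are all assumptions inferred from this page; verify them against the Get Crawl Result documentation.

```python
import time
import requests

API_KEY = "your_api_key"
task_id = "task-id-from-the-crawl-response"

# Assumed result endpoint and terminal status values -- verify against
# the Get Crawl Result documentation before relying on this.
while True:
    resp = requests.get(
        f"https://api.scrapegraphai.com/v1/crawl/{task_id}",
        headers={"SGAI-APIKEY": API_KEY},
        timeout=30,
    )
    resp.raise_for_status()
    job = resp.json()
    if job.get("status") in ("success", "failed"):
        break
    time.sleep(5)  # crawls take a while; poll politely

print(job)
```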


## Webhook Signature Verification

When you provide a `webhook_url`, SmartCrawler sends a POST request to your endpoint upon job completion. Each webhook request includes a signature header for verification.

### Webhook Headers

| Header | Description |
|---|---|
| `Content-Type` | `application/json` |
| `X-Webhook-Signature` | Signature for verifying the webhook's authenticity |

### Signature Format

The `X-Webhook-Signature` header contains your webhook secret in the format:

```
sha256={your_webhook_secret}
```

### Verifying Webhooks

To verify that a webhook request is authentic:

1. Retrieve your webhook secret from the dashboard.
2. Compare the `X-Webhook-Signature` header value with `sha256={your_secret}`.
Python (Flask):

```python
from flask import Flask, request, abort

app = Flask(__name__)
WEBHOOK_SECRET = "your_webhook_secret"

@app.route("/webhook", methods=["POST"])
def handle_webhook():
    # Compare the signature header against the expected value
    signature = request.headers.get("X-Webhook-Signature")
    expected_signature = f"sha256={WEBHOOK_SECRET}"

    if signature != expected_signature:
        abort(401, "Invalid signature")

    # Process the webhook payload
    data = request.json
    print(f"Received webhook for job: {data['request_id']}")
    return {"status": "ok"}

if __name__ == "__main__":
    app.run(port=8000)  # any port your endpoint is exposed on
```
Node.js (Express):

```javascript
const express = require('express');
const app = express();

const WEBHOOK_SECRET = 'your_webhook_secret';

app.post('/webhook', express.json(), (req, res) => {
  // Compare the signature header against the expected value
  const signature = req.headers['x-webhook-signature'];
  const expectedSignature = `sha256=${WEBHOOK_SECRET}`;

  if (signature !== expectedSignature) {
    return res.status(401).json({ error: 'Invalid signature' });
  }

  // Process the webhook payload
  console.log(`Received webhook for job: ${req.body.request_id}`);
  res.json({ status: 'ok' });
});

app.listen(8000); // any port your endpoint is exposed on

### Webhook Payload

The webhook POST request contains the following JSON payload:

```json
{
  "event": "crawler_completed",
  "request_id": "uuid-of-the-crawl-job",
  "webhook_id": "uuid-of-this-webhook-delivery",
  "status": "success | failed",
  "timestamp": "2024-01-28T12:00:00Z",
  "duration_seconds": 45.2,
  "errors": [],
  "result": "..."
}
```
| Field | Type | Description |
|---|---|---|
| `event` | string | Always `"crawler_completed"` |
| `request_id` | string | The crawl job ID (same as `task_id`) |
| `webhook_id` | string | Unique ID for this webhook delivery |
| `status` | string | `"success"` or `"failed"` |
| `timestamp` | string | ISO 8601 timestamp of completion |
| `duration_seconds` | float | Total job duration in seconds |
| `errors` | array | List of error messages (empty if successful) |
| `result` | string | The crawl result data (`null` if failed) |
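Inside a webhook handler, a minimal sketch of acting on these fields might look like the following. Whether `result` needs a decoding pass depends on the job type; the `json.loads` step below is an assumption, since markdown-mode jobs presumably return plain text.

```python
import json

def process_webhook_payload(data: dict) -> None:
    """Sketch of routing on the documented payload fields."""
    if data["status"] == "failed":
        # errors holds the failure messages; result is null in this case
        print(f"Job {data['request_id']} failed: {data['errors']}")
        return

    result = data["result"]
    # Assumption: result may arrive as a JSON-encoded string; decode if so.
    if isinstance(result, str):
        try:
            result = json.loads(result)
        except json.JSONDecodeError:
            pass  # plain markdown/text result; keep as-is

    print(f"Job {data['request_id']} finished in {data['duration_seconds']}s")
    print(result)
```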
Make sure to configure your webhook secret in the [dashboard](https://scrapegraphai.com/dashboard) before using webhooks. Each user has a unique webhook secret for secure verification.