Skip to content

Commit c1a6f09

Browse files
VinciGit00 and claude committed
feat!: migrate CLI to scrapegraph-js v2 API
Aligns the CLI with scrapegraph-js PR #13 (v2 SDK). The v2 API consolidates endpoints and drops legacy ones; the CLI follows suit.

Commands kept (rewritten against v2 types):
- scrape — multi-format (markdown/html/screenshot/branding/links/images/summary/json)
- crawl — polls until the job reaches a terminal state
- history — new response shape (data/pagination), service filter optional
- credits, validate — re-wired to getCredits / checkHealth

Commands added:
- extract — structured extraction with prompt + schema
- search — web search + optional extraction
- monitor — create/list/get/update/delete/pause/resume/activity

Commands removed (no longer in v2 API):
- smart-scraper (use `scrape -f json -p ...` or `extract`)
- search-scraper (use `search`)
- markdownify (use `scrape` — markdown is the default format)
- sitemap, agentic-scraper, generate-schema

Other changes:
- package.json: scrapegraph-js pinned to github:ScrapeGraphAI/scrapegraph-js#096c110, CLI bumped 0.2.1 → 1.0.0 to track SDK v2.0.0
- src/lib/env.ts: bridges legacy SGAI_TIMEOUT_S / JUST_SCRAPE_TIMEOUT_S → SGAI_TIMEOUT (renamed by SDK v2)
- README + smoke test updated

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2dc4981 commit c1a6f09

20 files changed

Lines changed: 738 additions & 581 deletions

README.md

Lines changed: 145 additions & 196 deletions
Large diffs are not rendered by default.

bun.lock

Lines changed: 4 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "just-scrape",
3-
"version": "0.2.1",
3+
"version": "1.0.0",
44
"description": "ScrapeGraph AI CLI tool",
55
"type": "module",
66
"main": "dist/cli.mjs",
@@ -28,7 +28,7 @@
2828
"chalk": "^5.4.1",
2929
"citty": "^0.1.6",
3030
"dotenv": "^17.2.4",
31-
"scrapegraph-js": "^1.0.0"
31+
"scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#096c110"
3232
},
3333
"devDependencies": {
3434
"@biomejs/biome": "^1.9.4",

src/cli.ts

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,11 @@ const main = defineCommand({
1212
description: "ScrapeGraph AI CLI tool",
1313
},
1414
subCommands: {
15-
"smart-scraper": () => import("./commands/smart-scraper.js").then((m) => m.default),
16-
"search-scraper": () => import("./commands/search-scraper.js").then((m) => m.default),
17-
markdownify: () => import("./commands/markdownify.js").then((m) => m.default),
18-
crawl: () => import("./commands/crawl.js").then((m) => m.default),
19-
sitemap: () => import("./commands/sitemap.js").then((m) => m.default),
2015
scrape: () => import("./commands/scrape.js").then((m) => m.default),
21-
"agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default),
22-
"generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default),
16+
extract: () => import("./commands/extract.js").then((m) => m.default),
17+
search: () => import("./commands/search.js").then((m) => m.default),
18+
crawl: () => import("./commands/crawl.js").then((m) => m.default),
19+
monitor: () => import("./commands/monitor.js").then((m) => m.default),
2320
history: () => import("./commands/history.js").then((m) => m.default),
2421
credits: () => import("./commands/credits.js").then((m) => m.default),
2522
validate: () => import("./commands/validate.js").then((m) => m.default),

src/commands/agentic-scraper.ts

Lines changed: 0 additions & 51 deletions
This file was deleted.

src/commands/crawl.ts

Lines changed: 89 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,115 @@
11
import { defineCommand } from "citty";
2-
import * as scrapegraphai from "scrapegraph-js";
2+
import { crawl } from "scrapegraph-js";
3+
import type { ApiCrawlRequest, ApiScrapeFormatEntry } from "scrapegraph-js";
34
import { resolveApiKey } from "../lib/folders.js";
45
import * as log from "../lib/log.js";
56

7+
const FORMATS = [
8+
"markdown",
9+
"html",
10+
"screenshot",
11+
"branding",
12+
"links",
13+
"images",
14+
"summary",
15+
] as const;
16+
type Format = (typeof FORMATS)[number];
17+
18+
const POLL_INTERVAL_MS = 3000;
19+
20+
function buildFormat(f: Format): ApiScrapeFormatEntry {
21+
if (f === "markdown" || f === "html") return { type: f, mode: "normal" };
22+
return { type: f } as ApiScrapeFormatEntry;
23+
}
24+
625
export default defineCommand({
726
meta: {
827
name: "crawl",
9-
description: "Crawl and extract data from multiple pages",
28+
description: "Crawl pages starting from a URL",
1029
},
1130
args: {
1231
url: {
1332
type: "positional",
14-
description: "Starting URL to crawl",
33+
description: "Starting URL",
1534
required: true,
1635
},
17-
prompt: {
36+
format: {
37+
type: "string",
38+
alias: "f",
39+
description: `Per-page format(s), comma-separated: ${FORMATS.join(", ")} (default: markdown)`,
40+
},
41+
"max-pages": { type: "string", description: "Maximum pages to crawl (default 50, max 1000)" },
42+
"max-depth": { type: "string", description: "Crawl depth (default 2)" },
43+
"max-links-per-page": { type: "string", description: "Max links per page (default 10)" },
44+
"allow-external": { type: "boolean", description: "Allow crawling external domains" },
45+
"include-patterns": {
1846
type: "string",
19-
alias: "p",
20-
description: "Extraction prompt (required when extraction mode is on)",
47+
description: "JSON array of regex patterns to include",
2148
},
22-
"no-extraction": {
23-
type: "boolean",
24-
description: "Return markdown only (2 credits/page instead of 10)",
49+
"exclude-patterns": {
50+
type: "string",
51+
description: "JSON array of regex patterns to exclude",
2552
},
26-
"max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" },
27-
depth: { type: "string", description: "Crawl depth (default 1)" },
28-
schema: { type: "string", description: "Output JSON schema (as JSON string)" },
29-
rules: { type: "string", description: "Crawl rules as JSON object string" },
30-
"no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" },
31-
stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
53+
mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js" },
54+
stealth: { type: "boolean", description: "Enable stealth mode" },
3255
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
3356
},
3457
run: async ({ args }) => {
3558
const out = log.create(!!args.json);
36-
out.docs("https://docs.scrapegraphai.com/services/smartcrawler");
37-
const key = await resolveApiKey(!!args.json);
38-
39-
const base: Record<string, unknown> = { url: args.url };
40-
if (args["max-pages"]) base.max_pages = Number(args["max-pages"]);
41-
if (args.depth) base.depth = Number(args.depth);
42-
if (args.rules) base.rules = JSON.parse(args.rules);
43-
if (args["no-sitemap"]) base.sitemap = false;
44-
if (args.stealth) base.stealth = true;
45-
46-
if (args["no-extraction"]) {
47-
base.extraction_mode = false;
48-
} else {
49-
if (args.prompt) base.prompt = args.prompt;
50-
if (args.schema) base.schema = JSON.parse(args.schema);
59+
out.docs("https://docs.scrapegraphai.com/api-reference/crawl");
60+
const apiKey = await resolveApiKey(!!args.json);
61+
62+
const requested = (args.format ?? "markdown")
63+
.split(",")
64+
.map((f) => f.trim())
65+
.filter(Boolean);
66+
for (const f of requested) {
67+
if (!FORMATS.includes(f as Format))
68+
out.error(`Unknown format: ${f}. Valid: ${FORMATS.join(", ")}`);
5169
}
70+
const formats = requested.map((f) => buildFormat(f as Format));
5271

53-
const params = base as scrapegraphai.CrawlParams;
72+
const params: ApiCrawlRequest = { url: args.url, formats };
73+
const mut = params as Record<string, unknown>;
74+
if (args["max-pages"]) mut.maxPages = Number(args["max-pages"]);
75+
if (args["max-depth"]) mut.maxDepth = Number(args["max-depth"]);
76+
if (args["max-links-per-page"]) mut.maxLinksPerPage = Number(args["max-links-per-page"]);
77+
if (args["allow-external"]) mut.allowExternal = true;
78+
if (args["include-patterns"]) mut.includePatterns = JSON.parse(args["include-patterns"]);
79+
if (args["exclude-patterns"]) mut.excludePatterns = JSON.parse(args["exclude-patterns"]);
5480

55-
out.start("Crawling");
56-
const result = await scrapegraphai.crawl(key, params, out.poll);
57-
out.stop(result.elapsedMs);
81+
const fetchConfig: Record<string, unknown> = {};
82+
if (args.mode) fetchConfig.mode = args.mode;
83+
if (args.stealth) fetchConfig.stealth = true;
84+
if (Object.keys(fetchConfig).length > 0) mut.fetchConfig = fetchConfig;
5885

59-
if (result.data) out.result(result.data);
60-
else out.error(result.error);
86+
out.start("Starting crawl");
87+
const job = await crawl.start(apiKey, params);
88+
if (!job.data) {
89+
out.error(job.error);
90+
return;
91+
}
92+
const jobId = job.data.id;
93+
let totalElapsed = job.elapsedMs;
94+
95+
while (true) {
96+
await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
97+
const status = await crawl.get(apiKey, jobId);
98+
totalElapsed += status.elapsedMs;
99+
if (!status.data) {
100+
out.error(status.error);
101+
return;
102+
}
103+
out.poll(`${status.data.status} (${status.data.finished}/${status.data.total})`);
104+
if (
105+
status.data.status === "completed" ||
106+
status.data.status === "failed" ||
107+
status.data.status === "deleted"
108+
) {
109+
out.stop(totalElapsed);
110+
out.result(status.data);
111+
return;
112+
}
113+
}
61114
},
62115
});

src/commands/credits.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { defineCommand } from "citty";
2-
import * as scrapegraphai from "scrapegraph-js";
2+
import { getCredits } from "scrapegraph-js";
33
import { resolveApiKey } from "../lib/folders.js";
44
import * as log from "../lib/log.js";
55

@@ -13,10 +13,10 @@ export default defineCommand({
1313
},
1414
run: async ({ args }) => {
1515
const out = log.create(!!args.json);
16-
const key = await resolveApiKey(!!args.json);
16+
const apiKey = await resolveApiKey(!!args.json);
1717

1818
out.start("Fetching credits");
19-
const result = await scrapegraphai.getCredits(key);
19+
const result = await getCredits(apiKey);
2020
out.stop(result.elapsedMs);
2121

2222
if (result.data) out.result(result.data);

src/commands/extract.ts

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import { defineCommand } from "citty";
2+
import { extract } from "scrapegraph-js";
3+
import type { ApiExtractRequest } from "scrapegraph-js";
4+
import { resolveApiKey } from "../lib/folders.js";
5+
import * as log from "../lib/log.js";
6+
7+
export default defineCommand({
8+
meta: {
9+
name: "extract",
10+
description: "Extract structured data from a URL using AI",
11+
},
12+
args: {
13+
url: {
14+
type: "positional",
15+
description: "Website URL to extract from",
16+
required: true,
17+
},
18+
prompt: {
19+
type: "string",
20+
alias: "p",
21+
description: "Extraction prompt",
22+
required: true,
23+
},
24+
schema: { type: "string", description: "Output JSON schema (JSON string)" },
25+
mode: { type: "string", description: "Fetch mode: auto (default), fast, js" },
26+
"html-mode": {
27+
type: "string",
28+
description: "HTML processing mode: normal (default), reader, prune",
29+
},
30+
stealth: { type: "boolean", description: "Enable stealth mode" },
31+
scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" },
32+
cookies: { type: "string", description: "Cookies as JSON object string" },
33+
headers: { type: "string", description: "Custom headers as JSON object string" },
34+
country: { type: "string", description: "ISO country code for geo-targeting" },
35+
json: { type: "boolean", description: "Output raw JSON (pipeable)" },
36+
},
37+
run: async ({ args }) => {
38+
const out = log.create(!!args.json);
39+
out.docs("https://docs.scrapegraphai.com/api-reference/extract");
40+
const apiKey = await resolveApiKey(!!args.json);
41+
42+
const fetchConfig: Record<string, unknown> = {};
43+
if (args.mode) fetchConfig.mode = args.mode;
44+
if (args.stealth) fetchConfig.stealth = true;
45+
if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls);
46+
if (args.cookies) fetchConfig.cookies = JSON.parse(args.cookies);
47+
if (args.headers) fetchConfig.headers = JSON.parse(args.headers);
48+
if (args.country) fetchConfig.country = args.country;
49+
50+
const params: ApiExtractRequest = { url: args.url, prompt: args.prompt };
51+
if (args.schema) (params as Record<string, unknown>).schema = JSON.parse(args.schema);
52+
if (args["html-mode"]) (params as Record<string, unknown>).mode = args["html-mode"];
53+
if (Object.keys(fetchConfig).length > 0)
54+
(params as Record<string, unknown>).fetchConfig = fetchConfig;
55+
56+
out.start("Extracting");
57+
const result = await extract(apiKey, params);
58+
out.stop(result.elapsedMs);
59+
60+
if (result.data) out.result(result.data);
61+
else out.error(result.error);
62+
},
63+
});

src/commands/generate-schema.ts

Lines changed: 0 additions & 37 deletions
This file was deleted.

0 commit comments

Comments (0)