Commit 60e99b6

feat: implement v2 SDK matching JS API 1:1
- Add ScrapeGraphAI sync client with httpx
- Add AsyncScrapeGraphAI async client
- Add Pydantic models for all request/response types
- Add nested resources: crawl, monitor, history
- Return ApiResult wrapper (never raises)
- Support SGAI_API_KEY, SGAI_DEBUG, SGAI_TIMEOUT_S env vars

API surface:
- client.scrape(ScrapeRequest)
- client.extract(ExtractRequest)
- client.search(SearchRequest)
- client.credits()
- client.health()
- client.crawl.start/get/stop/resume/delete
- client.monitor.create/list/get/update/delete/pause/resume
- client.history.list/get

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 731d70d commit 60e99b6

8 files changed: 1473 additions & 9 deletions
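For orientation, here is a minimal usage sketch of the sync API surface listed in the commit message. The client name, the ApiResult wrapper, and its status/data/error fields all appear in the diffs below; the `url` field on ScrapeRequest is an assumption, since schemas.py is not part of this excerpt.

from scrapegraph_py import ScrapeGraphAI, ScrapeRequest

# Falls back to the SGAI_API_KEY env var when api_key is omitted.
client = ScrapeGraphAI(api_key="sgai-...")

# Every call returns an ApiResult instead of raising.
result = client.scrape(ScrapeRequest(url="https://example.com"))  # `url` is assumed
if result.status == "success":
    print(result.data)
else:
    print(result.error)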


pyproject.toml

Lines changed: 39 additions & 7 deletions
@@ -1,14 +1,46 @@
 [project]
 name = "scrapegraph-py"
-version = "0.1.0"
-description = "Add your description here"
+version = "2.0.0"
+description = "Official Python SDK for ScrapeGraph AI API"
 readme = "README.md"
+license = "MIT"
 authors = [
-    { name = "FrancescoSaverioZuppichini", email = "francesco.zuppichini@gmail.com" }
+    { name = "ScrapeGraph AI", email = "support@scrapegraphai.com" }
 ]
-requires-python = ">=3.13"
-dependencies = []
+requires-python = ">=3.12"
+dependencies = [
+    "httpx>=0.27.0",
+    "pydantic>=2.0.0",
+]
+keywords = ["scraping", "ai", "web-scraping", "api", "sdk"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Typing :: Typed",
+]
+
+[project.urls]
+Homepage = "https://scrapegraphai.com"
+Documentation = "https://docs.scrapegraphai.com"
+Repository = "https://github.com/ScrapeGraphAI/scrapegraph-py"
+Issues = "https://github.com/ScrapeGraphAI/scrapegraph-py/issues"
 
 [build-system]
-requires = ["uv_build>=0.9.18,<0.10.0"]
-build-backend = "uv_build"
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/scrapegraph_py"]
+
+[dependency-groups]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-asyncio>=0.23.0",
+    "ruff>=0.4.0",
+]
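The dev group above pins pytest and pytest-asyncio. A sketch of how they might exercise the clients — the ValueError on a missing key and the never-raising ApiResult contract both come from async_client.py further down:

import pytest

from scrapegraph_py import AsyncScrapeGraphAI


def test_requires_api_key(monkeypatch):
    # The constructor raises when neither api_key nor SGAI_API_KEY is set.
    monkeypatch.delenv("SGAI_API_KEY", raising=False)
    with pytest.raises(ValueError):
        AsyncScrapeGraphAI()


@pytest.mark.asyncio
async def test_health_never_raises():
    # Hits the live health endpoint; failures come back as error results.
    async with AsyncScrapeGraphAI(api_key="sgai-test") as client:
        result = await client.health()
        assert result.status in ("success", "error")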

src/scrapegraph_py/__init__.py

Lines changed: 79 additions & 2 deletions
@@ -1,2 +1,79 @@
-def hello() -> str:
-    return "Hello from scrapegraph-py!"
+from .client import ScrapeGraphAI
+from .async_client import AsyncScrapeGraphAI
+
+from .types import (
+    ApiResult,
+    ApiScrapeResponse,
+    ApiExtractResponse,
+    ApiSearchResponse,
+    ApiCrawlResponse,
+    ApiCrawlResult,
+    ApiCrawlPage,
+    ApiMonitorResponse,
+    ApiMonitorResult,
+    ApiMonitorDiffs,
+    ApiHistoryPage,
+    ApiHistoryEntry,
+    ApiCreditsResponse,
+    ApiHealthResponse,
+    ApiTokenUsage,
+    ApiSearchResult,
+    ApiBranding,
+)
+
+from .schemas import (
+    ScrapeRequest,
+    ExtractRequest,
+    SearchRequest,
+    CrawlRequest,
+    MonitorCreateRequest,
+    MonitorUpdateRequest,
+    HistoryFilter,
+    FetchConfig,
+    MarkdownFormatConfig,
+    HtmlFormatConfig,
+    ScreenshotFormatConfig,
+    JsonFormatConfig,
+    LinksFormatConfig,
+    ImagesFormatConfig,
+    SummaryFormatConfig,
+    BrandingFormatConfig,
+)
+
+__all__ = [
+    "ScrapeGraphAI",
+    "AsyncScrapeGraphAI",
+    "ApiResult",
+    "ApiScrapeResponse",
+    "ApiExtractResponse",
+    "ApiSearchResponse",
+    "ApiCrawlResponse",
+    "ApiCrawlResult",
+    "ApiCrawlPage",
+    "ApiMonitorResponse",
+    "ApiMonitorResult",
+    "ApiMonitorDiffs",
+    "ApiHistoryPage",
+    "ApiHistoryEntry",
+    "ApiCreditsResponse",
+    "ApiHealthResponse",
+    "ApiTokenUsage",
+    "ApiSearchResult",
+    "ApiBranding",
+    "ScrapeRequest",
+    "ExtractRequest",
+    "SearchRequest",
+    "CrawlRequest",
+    "MonitorCreateRequest",
+    "MonitorUpdateRequest",
+    "HistoryFilter",
+    "FetchConfig",
+    "MarkdownFormatConfig",
+    "HtmlFormatConfig",
+    "ScreenshotFormatConfig",
+    "JsonFormatConfig",
+    "LinksFormatConfig",
+    "ImagesFormatConfig",
+    "SummaryFormatConfig",
+    "BrandingFormatConfig",
+]
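The schemas module exports one config class per output format (markdown, HTML, screenshot, JSON, links, images, summary, branding). How these attach to a request is not shown in this commit; a purely hypothetical sketch, assuming ScrapeRequest takes url and formats fields and the configs construct with defaults:

from scrapegraph_py import ScrapeGraphAI, ScrapeRequest, MarkdownFormatConfig, ScreenshotFormatConfig

client = ScrapeGraphAI(api_key="sgai-...")
# Hypothetical field names: `url` and `formats` are not confirmed by this diff.
request = ScrapeRequest(
    url="https://example.com",
    formats=[MarkdownFormatConfig(), ScreenshotFormatConfig()],
)
result = client.scrape(request)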

src/scrapegraph_py/async_client.py

Lines changed: 253 additions & 0 deletions
@@ -0,0 +1,253 @@
+from __future__ import annotations
+
+import json
+import os
+import re
+import sys
+import time
+from datetime import datetime
+
+import httpx
+from pydantic import BaseModel
+
+from .env import env
+from .types import (
+    ApiResult,
+    ApiScrapeResponse,
+    ApiExtractResponse,
+    ApiSearchResponse,
+    ApiCrawlResponse,
+    ApiMonitorResponse,
+    ApiHistoryPage,
+    ApiHistoryEntry,
+    ApiCreditsResponse,
+    ApiHealthResponse,
+)
+from .schemas import (
+    ScrapeRequest,
+    ExtractRequest,
+    SearchRequest,
+    CrawlRequest,
+    MonitorCreateRequest,
+    MonitorUpdateRequest,
+    HistoryFilter,
+)
+
+
+def _debug(label: str, data: object = None) -> None:
+    # Log request/response traffic to stderr when SGAI_DEBUG is set.
+    if not env.debug:
+        return
+    ts = datetime.now().isoformat()
+    if data is not None:
+        print(f"[{ts}] {label}", json.dumps(data, indent=2, default=str), file=sys.stderr)
+    else:
+        print(f"[{ts}] {label}", file=sys.stderr)
+
+
+def _map_http_error(status: int) -> str:
+    # Translate common HTTP status codes into actionable messages.
+    match status:
+        case 401:
+            return "Invalid or missing API key"
+        case 402:
+            return "Insufficient credits - purchase more at https://dashboard.scrapegraphai.com"
+        case 422:
+            return "Invalid parameters - check your request"
+        case 429:
+            return "Rate limited - slow down and retry"
+        case 500:
+            return "Server error - try again later"
+        case _:
+            return f"HTTP {status}"
+
+
+def _to_camel(s: str) -> str:
+    parts = s.split("_")
+    return parts[0] + "".join(p.capitalize() for p in parts[1:])
+
+
+def _serialize(model: BaseModel) -> dict:
+    # Dump the model, dropping None fields, then recursively convert
+    # snake_case keys to the camelCase the API expects.
+    data = model.model_dump(exclude_none=True, by_alias=True)
+
+    def convert_keys(obj):
+        if isinstance(obj, dict):
+            return {_to_camel(k): convert_keys(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [convert_keys(i) for i in obj]
+        return obj
+
+    return convert_keys(data)
+
+
+class AsyncCrawlResource:
+    def __init__(self, client: AsyncScrapeGraphAI):
+        self._client = client
+
+    async def start(self, params: CrawlRequest) -> ApiResult[ApiCrawlResponse]:
+        return await self._client._post("/crawl", params, ApiCrawlResponse)
+
+    async def get(self, id: str) -> ApiResult[ApiCrawlResponse]:
+        return await self._client._get(f"/crawl/{id}", ApiCrawlResponse)
+
+    async def stop(self, id: str) -> ApiResult[dict]:
+        return await self._client._post_empty(f"/crawl/{id}/stop")
+
+    async def resume(self, id: str) -> ApiResult[dict]:
+        return await self._client._post_empty(f"/crawl/{id}/resume")
+
+    async def delete(self, id: str) -> ApiResult[dict]:
+        return await self._client._delete(f"/crawl/{id}")
+
+
+class AsyncMonitorResource:
+    def __init__(self, client: AsyncScrapeGraphAI):
+        self._client = client
+
+    async def create(self, params: MonitorCreateRequest) -> ApiResult[ApiMonitorResponse]:
+        return await self._client._post("/monitor", params, ApiMonitorResponse)
+
+    async def list(self) -> ApiResult[list[ApiMonitorResponse]]:
+        return await self._client._get("/monitor", list[ApiMonitorResponse])
+
+    async def get(self, id: str) -> ApiResult[ApiMonitorResponse]:
+        return await self._client._get(f"/monitor/{id}", ApiMonitorResponse)
+
+    async def update(self, id: str, params: MonitorUpdateRequest) -> ApiResult[ApiMonitorResponse]:
+        return await self._client._patch(f"/monitor/{id}", params, ApiMonitorResponse)
+
+    async def delete(self, id: str) -> ApiResult[dict]:
+        return await self._client._delete(f"/monitor/{id}")
+
+    async def pause(self, id: str) -> ApiResult[ApiMonitorResponse]:
+        return await self._client._post_empty(f"/monitor/{id}/pause", ApiMonitorResponse)
+
+    async def resume(self, id: str) -> ApiResult[ApiMonitorResponse]:
+        return await self._client._post_empty(f"/monitor/{id}/resume", ApiMonitorResponse)
+
+
+class AsyncHistoryResource:
+    def __init__(self, client: AsyncScrapeGraphAI):
+        self._client = client
+
+    async def list(self, params: HistoryFilter | None = None) -> ApiResult[ApiHistoryPage]:
+        # Only non-empty filters become query-string parameters.
+        qs = {}
+        if params:
+            if params.page:
+                qs["page"] = str(params.page)
+            if params.limit:
+                qs["limit"] = str(params.limit)
+            if params.service:
+                qs["service"] = params.service
+        return await self._client._get("/history", ApiHistoryPage, params=qs if qs else None)
+
+    async def get(self, id: str) -> ApiResult[ApiHistoryEntry]:
+        return await self._client._get(f"/history/{id}", ApiHistoryEntry)
+
+
+class AsyncScrapeGraphAI:
+    def __init__(self, *, api_key: str | None = None):
+        self._api_key = api_key or os.environ.get("SGAI_API_KEY")
+        if not self._api_key:
+            raise ValueError("API key required: pass api_key or set SGAI_API_KEY env var")
+
+        self._http = httpx.AsyncClient(
+            base_url=env.base_url,
+            timeout=env.timeout,
+            headers={"SGAI-APIKEY": self._api_key},
+        )
+
+        self.crawl = AsyncCrawlResource(self)
+        self.monitor = AsyncMonitorResource(self)
+        self.history = AsyncHistoryResource(self)
+
+    async def _request[T](
+        self,
+        method: str,
+        path: str,
+        response_type: type[T],
+        body: BaseModel | None = None,
+        params: dict | None = None,
+        base_url: str | None = None,
+    ) -> ApiResult[T]:
+        # response_type documents the expected payload shape; the parsed
+        # JSON body is returned as-is inside the ApiResult wrapper.
+        url = path if base_url is None else f"{base_url}{path}"
+        json_body = _serialize(body) if body is not None else None
+        _debug(f"-> {method} {url}", json_body)
+
+        try:
+            start = time.perf_counter()
+
+            if base_url:
+                # One-off client for endpoints on a different host
+                # (e.g. the health check).
+                async with httpx.AsyncClient(timeout=env.timeout) as client:
+                    resp = await client.request(
+                        method,
+                        url,
+                        json=json_body,
+                        params=params,
+                        headers={"SGAI-APIKEY": self._api_key},
+                    )
+            else:
+                resp = await self._http.request(method, path, json=json_body, params=params)
+
+            # Prefer the server-reported duration from a Server-Timing
+            # header; fall back to wall-clock time otherwise.
+            server_timing = resp.headers.get("Server-Timing")
+            if server_timing:
+                match = re.search(r"dur=(\d+(?:\.\d+)?)", server_timing)
+                elapsed_ms = int(float(match.group(1))) if match else int((time.perf_counter() - start) * 1000)
+            else:
+                elapsed_ms = int((time.perf_counter() - start) * 1000)
+
+            if not resp.is_success:
+                # Append the server's "detail" message to the mapped error
+                # when the error body is JSON.
+                error = _map_http_error(resp.status_code)
+                try:
+                    err_body = resp.json()
+                    _debug(f"<- {resp.status_code}", err_body)
+                    if "detail" in err_body:
+                        d = err_body["detail"]
+                        detail = d if isinstance(d, str) else str(d)
+                        error = f"{error}: {detail}"
+                except Exception:
+                    pass
+                return ApiResult(status="error", data=None, error=error, elapsed_ms=elapsed_ms)
+
+            data = resp.json()
+            _debug(f"<- {resp.status_code} ({elapsed_ms}ms)", data)
+            return ApiResult(status="success", data=data, elapsed_ms=elapsed_ms)
+
+        except httpx.TimeoutException:
+            return ApiResult(status="error", data=None, error="Request timed out", elapsed_ms=0)
+        except Exception as e:
+            return ApiResult(status="error", data=None, error=str(e), elapsed_ms=0)
+
+    async def _get[T](self, path: str, response_type: type[T], params: dict | None = None) -> ApiResult[T]:
+        return await self._request("GET", path, response_type, params=params)
+
+    async def _post[T](self, path: str, body: BaseModel, response_type: type[T]) -> ApiResult[T]:
+        return await self._request("POST", path, response_type, body=body)
+
+    async def _post_empty[T](self, path: str, response_type: type[T] = dict) -> ApiResult[T]:
+        return await self._request("POST", path, response_type)
+
+    async def _patch[T](self, path: str, body: BaseModel, response_type: type[T]) -> ApiResult[T]:
+        return await self._request("PATCH", path, response_type, body=body)
+
+    async def _delete(self, path: str) -> ApiResult[dict]:
+        return await self._request("DELETE", path, dict)
+
+    async def scrape(self, params: ScrapeRequest) -> ApiResult[ApiScrapeResponse]:
+        return await self._post("/scrape", params, ApiScrapeResponse)
+
+    async def extract(self, params: ExtractRequest) -> ApiResult[ApiExtractResponse]:
+        return await self._post("/extract", params, ApiExtractResponse)
+
+    async def search(self, params: SearchRequest) -> ApiResult[ApiSearchResponse]:
+        return await self._post("/search", params, ApiSearchResponse)
+
+    async def credits(self) -> ApiResult[ApiCreditsResponse]:
+        return await self._get("/credits", ApiCreditsResponse)
+
+    async def health(self) -> ApiResult[ApiHealthResponse]:
+        # The health check lives on a separate base URL.
+        return await self._request("GET", "/healthz", ApiHealthResponse, base_url=env.health_url)
+
+    async def close(self) -> None:
+        await self._http.aclose()
+
+    async def __aenter__(self) -> AsyncScrapeGraphAI:
+        return self
+
+    async def __aexit__(self, *args) -> None:
+        await self.close()
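To close the loop, a usage sketch of the async client and its nested crawl resource. The context manager, the never-raising ApiResult, and the crawl.start/get methods come straight from the code above; the CrawlRequest `url` field and the "id" key in the payload are assumptions.

import asyncio

from scrapegraph_py import AsyncScrapeGraphAI, CrawlRequest


async def main() -> None:
    # __aexit__ closes the shared httpx.AsyncClient for us.
    async with AsyncScrapeGraphAI(api_key="sgai-...") as client:
        started = await client.crawl.start(CrawlRequest(url="https://example.com"))  # `url` is assumed
        if started.status == "error":
            print(started.error)
            return
        crawl_id = started.data["id"]  # raw JSON payload; the "id" key is assumed
        job = await client.crawl.get(crawl_id)
        print(job.status, f"({job.elapsed_ms}ms)")


asyncio.run(main())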
