Skip to content

Commit 6959035

Browse files
committed
feat(api): align python sdk with sgai v2
1 parent 5d0606a commit 6959035

14 files changed

Lines changed: 1315 additions & 1071 deletions

File tree

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 155 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -19,19 +19,22 @@
1919

2020
from aiohttp import ClientSession, ClientTimeout, TCPConnector
2121
from aiohttp.client_exceptions import ClientError
22-
from pydantic import BaseModel
2322

2423
from scrapegraph_py.config import API_BASE_URL, DEFAULT_HEADERS
2524
from scrapegraph_py.exceptions import APIError
2625
from scrapegraph_py.logger import sgai_logger as logger
27-
from scrapegraph_py.models.crawl import CrawlFormat, CrawlRequest
28-
from scrapegraph_py.models.extract import ExtractRequest
29-
from scrapegraph_py.models.history import HistoryFilter
30-
from scrapegraph_py.models.monitor import MonitorCreateRequest
31-
from scrapegraph_py.models.scrape import ScrapeFormat, ScrapeRequest
32-
from scrapegraph_py.models.search import SearchRequest
3326
from scrapegraph_py.models.shared import FetchConfig, LlmConfig
3427
from scrapegraph_py.utils.helpers import handle_async_response, validate_api_key
28+
from scrapegraph_py.utils.request_builders import (
29+
build_crawl_payload,
30+
build_extract_payload,
31+
build_history_params,
32+
build_monitor_payload,
33+
build_schema_payload,
34+
build_scrape_payload,
35+
build_search_payload,
36+
build_validate_params,
37+
)
3538

3639

3740
class _AsyncCrawlNamespace:
@@ -43,26 +46,37 @@ def __init__(self, client: "AsyncClient"):
4346
async def start(
4447
self,
4548
url: str,
46-
depth: int = 2,
49+
depth: Optional[int] = None,
4750
max_pages: int = 10,
4851
format: str = "markdown",
4952
include_patterns: Optional[List[str]] = None,
5053
exclude_patterns: Optional[List[str]] = None,
5154
fetch_config: Optional[FetchConfig] = None,
55+
formats: Optional[List[Dict[str, Any]]] = None,
56+
max_depth: Optional[int] = None,
57+
max_links_per_page: int = 10,
58+
allow_external: bool = False,
59+
content_types: Optional[List[str]] = None,
5260
) -> Dict[str, Any]:
5361
"""Start a crawl job."""
5462
logger.info(f"Starting crawl for {url}")
55-
request = CrawlRequest(
56-
url=url,
57-
depth=depth,
58-
max_pages=max_pages,
59-
format=CrawlFormat(format),
60-
include_patterns=include_patterns,
61-
exclude_patterns=exclude_patterns,
62-
fetch_config=fetch_config,
63-
)
6463
return await self._client._make_request(
65-
"POST", f"{self._client.base_url}/crawl", json=request.model_dump()
64+
"POST",
65+
f"{self._client.base_url}/crawl",
66+
json=build_crawl_payload(
67+
url,
68+
depth=depth,
69+
max_pages=max_pages,
70+
format=format,
71+
include_patterns=include_patterns,
72+
exclude_patterns=exclude_patterns,
73+
fetch_config=fetch_config,
74+
formats=formats,
75+
max_depth=max_depth,
76+
max_links_per_page=max_links_per_page,
77+
allow_external=allow_external,
78+
content_types=content_types,
79+
),
6680
)
6781

6882
async def status(self, crawl_id: str) -> Dict[str, Any]:
@@ -95,27 +109,34 @@ def __init__(self, client: "AsyncClient"):
95109

96110
async def create(
97111
self,
98-
name: str,
112+
name: Optional[str],
99113
url: str,
100-
prompt: str,
114+
prompt: Optional[str],
101115
interval: str,
102116
output_schema: Optional[Dict[str, Any]] = None,
103117
fetch_config: Optional[FetchConfig] = None,
104118
llm_config: Optional[LlmConfig] = None,
119+
schema: Optional[Any] = None,
120+
formats: Optional[List[Dict[str, Any]]] = None,
121+
webhook_url: Optional[str] = None,
105122
) -> Dict[str, Any]:
106123
"""Create a new monitor."""
107124
logger.info(f"Creating monitor '{name}' for {url}")
108-
request = MonitorCreateRequest(
109-
name=name,
110-
url=url,
111-
prompt=prompt,
112-
interval=interval,
113-
output_schema=output_schema,
114-
fetch_config=fetch_config,
115-
llm_config=llm_config,
116-
)
117125
return await self._client._make_request(
118-
"POST", f"{self._client.base_url}/monitor", json=request.model_dump()
126+
"POST",
127+
f"{self._client.base_url}/monitor",
128+
json=build_monitor_payload(
129+
name=name,
130+
url=url,
131+
prompt=prompt,
132+
interval=interval,
133+
output_schema=output_schema,
134+
fetch_config=fetch_config,
135+
llm_config=llm_config,
136+
schema=schema,
137+
formats=formats,
138+
webhook_url=webhook_url,
139+
),
119140
)
120141

121142
async def list(self) -> Dict[str, Any]:
@@ -188,7 +209,7 @@ def from_env(
188209

189210
def __init__(
190211
self,
191-
api_key: str = None,
212+
api_key: Optional[str] = None,
192213
base_url: Optional[str] = None,
193214
verify_ssl: bool = True,
194215
timeout: Optional[float] = None,
@@ -283,22 +304,27 @@ async def scrape(
283304
url: str,
284305
format: str = "markdown",
285306
fetch_config: Optional[FetchConfig] = None,
307+
formats: Optional[List[Dict[str, Any]]] = None,
308+
content_type: Optional[str] = None,
286309
) -> Dict[str, Any]:
287310
"""Scrape a page and return it in the specified format.
288311
289312
Args:
290313
url: URL to scrape
291-
format: Output format - 'markdown', 'html', 'screenshot', or 'branding'
314+
format: Legacy single output format
292315
fetch_config: Fetch configuration options
293316
"""
294317
logger.info(f"Scraping {url} (format={format})")
295-
request = ScrapeRequest(
296-
url=url,
297-
format=ScrapeFormat(format),
298-
fetch_config=fetch_config,
299-
)
300318
return await self._make_request(
301-
"POST", f"{self.base_url}/scrape", json=request.model_dump()
319+
"POST",
320+
f"{self.base_url}/scrape",
321+
json=build_scrape_payload(
322+
url,
323+
format=format,
324+
fetch_config=fetch_config,
325+
formats=formats,
326+
content_type=content_type,
327+
),
302328
)
303329

304330
# ------------------------------------------------------------------
@@ -307,43 +333,43 @@ async def scrape(
307333

308334
async def extract(
309335
self,
310-
url: str,
336+
url: Optional[str],
311337
prompt: str,
312338
output_schema: Optional[Any] = None,
313339
fetch_config: Optional[FetchConfig] = None,
314340
llm_config: Optional[LlmConfig] = None,
341+
*,
342+
schema: Optional[Any] = None,
343+
mode: str = "normal",
344+
content_type: Optional[str] = None,
345+
html: Optional[str] = None,
346+
markdown: Optional[str] = None,
315347
) -> Dict[str, Any]:
316348
"""Extract structured data from a page using AI.
317349
318350
Args:
319351
url: URL to extract data from
320352
prompt: Natural language prompt describing what to extract
321-
output_schema: JSON Schema dict or Pydantic BaseModel class for output structure
353+
output_schema: Legacy alias for schema
322354
fetch_config: Fetch configuration options
323-
llm_config: LLM configuration options
355+
llm_config: Deprecated and ignored by the SGAI v2 extract route
324356
"""
325357
logger.info(f"Extracting from {url}")
326-
327-
schema_dict = None
328-
if output_schema is not None:
329-
if isinstance(output_schema, type) and issubclass(output_schema, BaseModel):
330-
schema_dict = output_schema.model_json_schema()
331-
elif isinstance(output_schema, dict):
332-
schema_dict = output_schema
333-
else:
334-
raise ValueError(
335-
"output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class"
336-
)
337-
338-
request = ExtractRequest(
339-
url=url,
340-
prompt=prompt,
341-
output_schema=schema_dict,
342-
fetch_config=fetch_config,
343-
llm_config=llm_config,
344-
)
345358
return await self._make_request(
346-
"POST", f"{self.base_url}/extract", json=request.model_dump()
359+
"POST",
360+
f"{self.base_url}/extract",
361+
json=build_extract_payload(
362+
url=url,
363+
prompt=prompt,
364+
output_schema=output_schema,
365+
fetch_config=fetch_config,
366+
llm_config=llm_config,
367+
schema=schema,
368+
mode=mode,
369+
content_type=content_type,
370+
html=html,
371+
markdown=markdown,
372+
),
347373
)
348374

349375
# ------------------------------------------------------------------
@@ -357,38 +383,40 @@ async def search(
357383
output_schema: Optional[Any] = None,
358384
location_geo_code: Optional[str] = None,
359385
llm_config: Optional[LlmConfig] = None,
386+
*,
387+
schema: Optional[Any] = None,
388+
prompt: Optional[str] = None,
389+
format: str = "markdown",
390+
mode: str = "prune",
391+
fetch_config: Optional[FetchConfig] = None,
392+
time_range: Optional[str] = None,
360393
) -> Dict[str, Any]:
361394
"""Search the web and extract structured results.
362395
363396
Args:
364397
query: The search query
365398
num_results: Number of results (3-20, default 5)
366-
output_schema: JSON Schema dict or Pydantic BaseModel class for output structure
367-
location_geo_code: Two-letter country code for geo-targeted results (e.g. 'us', 'gb')
368-
llm_config: LLM configuration options
399+
output_schema: Legacy alias for schema
400+
location_geo_code: Geo code for geo-targeted results
401+
llm_config: Deprecated and ignored by the SGAI v2 search route
369402
"""
370403
logger.info(f"Searching: {query}")
371-
372-
schema_dict = None
373-
if output_schema is not None:
374-
if isinstance(output_schema, type) and issubclass(output_schema, BaseModel):
375-
schema_dict = output_schema.model_json_schema()
376-
elif isinstance(output_schema, dict):
377-
schema_dict = output_schema
378-
else:
379-
raise ValueError(
380-
"output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class"
381-
)
382-
383-
request = SearchRequest(
384-
query=query,
385-
num_results=num_results,
386-
output_schema=schema_dict,
387-
location_geo_code=location_geo_code,
388-
llm_config=llm_config,
389-
)
390404
return await self._make_request(
391-
"POST", f"{self.base_url}/search", json=request.model_dump()
405+
"POST",
406+
f"{self.base_url}/search",
407+
json=build_search_payload(
408+
query=query,
409+
num_results=num_results,
410+
output_schema=output_schema,
411+
location_geo_code=location_geo_code,
412+
llm_config=llm_config,
413+
schema=schema,
414+
prompt=prompt,
415+
format=format,
416+
mode=mode,
417+
fetch_config=fetch_config,
418+
time_range=time_range,
419+
),
392420
)
393421

394422
# ------------------------------------------------------------------
@@ -410,22 +438,58 @@ async def history(
410438
status: Optional[str] = None,
411439
limit: Optional[int] = None,
412440
offset: Optional[int] = None,
441+
*,
442+
page: Optional[int] = None,
443+
service: Optional[str] = None,
413444
) -> Dict[str, Any]:
414445
"""Retrieve request history.
415446
416447
Args:
417-
endpoint: Filter by endpoint name (e.g. 'scrape', 'extract')
418-
status: Filter by request status
448+
endpoint: Legacy alias for service
449+
status: Unsupported in SGAI v2
419450
limit: Maximum number of results (1-100)
420-
offset: Number of results to skip
451+
offset: Legacy alias mapped onto page when possible
421452
"""
422453
logger.info("Fetching history")
423-
filter_obj = HistoryFilter(
424-
endpoint=endpoint, status=status, limit=limit, offset=offset
454+
return await self._make_request(
455+
"GET",
456+
f"{self.base_url}/history",
457+
params=build_history_params(
458+
endpoint=endpoint,
459+
status=status,
460+
limit=limit,
461+
offset=offset,
462+
page=page,
463+
service=service,
464+
)
465+
or None,
466+
)
467+
468+
# ------------------------------------------------------------------
469+
# Schema / Validate
470+
# ------------------------------------------------------------------
471+
472+
async def schema(
473+
self,
474+
prompt: str,
475+
existing_schema: Optional[Any] = None,
476+
model: Optional[str] = None,
477+
) -> Dict[str, Any]:
478+
"""Generate or refine a JSON schema from a prompt."""
479+
logger.info("Generating schema")
480+
return await self._make_request(
481+
"POST",
482+
f"{self.base_url}/schema",
483+
json=build_schema_payload(
484+
prompt, existing_schema=existing_schema, model=model
485+
),
425486
)
426-
params = filter_obj.to_params()
487+
488+
async def validate(self, email: str) -> Dict[str, Any]:
489+
"""Validate an email address against SGAI's allowlist endpoint."""
490+
logger.info("Validating email")
427491
return await self._make_request(
428-
"GET", f"{self.base_url}/history", params=params or None
492+
"GET", f"{self.base_url}/validate", params=build_validate_params(email)
429493
)
430494

431495
# ------------------------------------------------------------------

0 commit comments

Comments
 (0)