 
 from aiohttp import ClientSession, ClientTimeout, TCPConnector
 from aiohttp.client_exceptions import ClientError
-from pydantic import BaseModel
 
 from scrapegraph_py.config import API_BASE_URL, DEFAULT_HEADERS
 from scrapegraph_py.exceptions import APIError
 from scrapegraph_py.logger import sgai_logger as logger
-from scrapegraph_py.models.crawl import CrawlFormat, CrawlRequest
-from scrapegraph_py.models.extract import ExtractRequest
-from scrapegraph_py.models.history import HistoryFilter
-from scrapegraph_py.models.monitor import MonitorCreateRequest
-from scrapegraph_py.models.scrape import ScrapeFormat, ScrapeRequest
-from scrapegraph_py.models.search import SearchRequest
 from scrapegraph_py.models.shared import FetchConfig, LlmConfig
 from scrapegraph_py.utils.helpers import handle_async_response, validate_api_key
+from scrapegraph_py.utils.request_builders import (
+    build_crawl_payload,
+    build_extract_payload,
+    build_history_params,
+    build_monitor_payload,
+    build_schema_payload,
+    build_scrape_payload,
+    build_search_payload,
+    build_validate_params,
+)
 
 
 class _AsyncCrawlNamespace:
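
The Pydantic request models (and the direct pydantic import) give way to plain payload builders. The new request_builders module is not shown in this diff, so the following is a minimal hypothetical sketch of the shape one such helper might take; everything beyond the imported name build_scrape_payload is an assumption.

    # Hypothetical sketch only -- the real request_builders module is not in this diff.
    from typing import Any, Dict, List, Optional

    from scrapegraph_py.models.shared import FetchConfig

    def build_scrape_payload(
        url: str,
        *,
        format: str = "markdown",
        fetch_config: Optional[FetchConfig] = None,
        formats: Optional[List[Dict[str, Any]]] = None,
        content_type: Optional[str] = None,
    ) -> Dict[str, Any]:
        # Required fields first; unset optionals are omitted so the JSON body stays minimal.
        payload: Dict[str, Any] = {"url": url, "format": format}
        if formats is not None:
            payload["formats"] = formats
        if content_type is not None:
            payload["content_type"] = content_type
        if fetch_config is not None:
            payload["fetch_config"] = fetch_config.model_dump()  # assumes a Pydantic config
        return payload
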
@@ -43,26 +46,37 @@ def __init__(self, client: "AsyncClient"):
     async def start(
         self,
         url: str,
-        depth: int = 2,
+        depth: Optional[int] = None,
         max_pages: int = 10,
         format: str = "markdown",
         include_patterns: Optional[List[str]] = None,
         exclude_patterns: Optional[List[str]] = None,
         fetch_config: Optional[FetchConfig] = None,
+        formats: Optional[List[Dict[str, Any]]] = None,
+        max_depth: Optional[int] = None,
+        max_links_per_page: int = 10,
+        allow_external: bool = False,
+        content_types: Optional[List[str]] = None,
     ) -> Dict[str, Any]:
         """Start a crawl job."""
         logger.info(f"Starting crawl for {url}")
-        request = CrawlRequest(
-            url=url,
-            depth=depth,
-            max_pages=max_pages,
-            format=CrawlFormat(format),
-            include_patterns=include_patterns,
-            exclude_patterns=exclude_patterns,
-            fetch_config=fetch_config,
-        )
         return await self._client._make_request(
-            "POST", f"{self._client.base_url}/crawl", json=request.model_dump()
+            "POST",
+            f"{self._client.base_url}/crawl",
+            json=build_crawl_payload(
+                url,
+                depth=depth,
+                max_pages=max_pages,
+                format=format,
+                include_patterns=include_patterns,
+                exclude_patterns=exclude_patterns,
+                fetch_config=fetch_config,
+                formats=formats,
+                max_depth=max_depth,
+                max_links_per_page=max_links_per_page,
+                allow_external=allow_external,
+                content_types=content_types,
+            ),
         )
 
     async def status(self, crawl_id: str) -> Dict[str, Any]:
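
Callers can steer the crawler with the new keyword arguments. A hedged usage sketch, assuming the namespace is exposed as client.crawl, that AsyncClient supports async-with, and that the response carries a crawl id; the snippets after later hunks assume this same client context.

    import asyncio

    from scrapegraph_py import AsyncClient  # import path assumed from the package layout

    async def main() -> None:
        async with AsyncClient(api_key="sgai-...") as client:
            job = await client.crawl.start(
                "https://example.com",
                max_pages=5,
                max_depth=2,  # new-style depth control alongside legacy depth
                content_types=["text/html"],
            )
            # "crawl_id" is an assumed response key; check the API reference.
            print(await client.crawl.status(job["crawl_id"]))

    asyncio.run(main())
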
@@ -95,27 +109,34 @@ def __init__(self, client: "AsyncClient"):
 
     async def create(
         self,
-        name: str,
+        name: Optional[str],
         url: str,
-        prompt: str,
+        prompt: Optional[str],
         interval: str,
         output_schema: Optional[Dict[str, Any]] = None,
         fetch_config: Optional[FetchConfig] = None,
         llm_config: Optional[LlmConfig] = None,
+        schema: Optional[Any] = None,
+        formats: Optional[List[Dict[str, Any]]] = None,
+        webhook_url: Optional[str] = None,
     ) -> Dict[str, Any]:
         """Create a new monitor."""
         logger.info(f"Creating monitor '{name}' for {url}")
-        request = MonitorCreateRequest(
-            name=name,
-            url=url,
-            prompt=prompt,
-            interval=interval,
-            output_schema=output_schema,
-            fetch_config=fetch_config,
-            llm_config=llm_config,
-        )
         return await self._client._make_request(
-            "POST", f"{self._client.base_url}/monitor", json=request.model_dump()
+            "POST",
+            f"{self._client.base_url}/monitor",
+            json=build_monitor_payload(
+                name=name,
+                url=url,
+                prompt=prompt,
+                interval=interval,
+                output_schema=output_schema,
+                fetch_config=fetch_config,
+                llm_config=llm_config,
+                schema=schema,
+                formats=formats,
+                webhook_url=webhook_url,
+            ),
         )
 
     async def list(self) -> Dict[str, Any]:
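
name and prompt become optional, and the builder additionally accepts schema, formats, and webhook_url. An illustrative call inside the async context above; the monitors attribute name and the interval values are assumptions, since only the method signature is confirmed by this diff.

    monitor = await client.monitors.create(
        name="price-watch",
        url="https://example.com/product",
        prompt="Extract the current price",
        interval="daily",
        webhook_url="https://example.com/hooks/price",  # new in this refactor
    )
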
@@ -188,7 +209,7 @@ def from_env(
 
     def __init__(
         self,
-        api_key: str = None,
+        api_key: Optional[str] = None,
         base_url: Optional[str] = None,
         verify_ssl: bool = True,
         timeout: Optional[float] = None,
@@ -283,22 +304,27 @@ async def scrape(
         url: str,
         format: str = "markdown",
         fetch_config: Optional[FetchConfig] = None,
+        formats: Optional[List[Dict[str, Any]]] = None,
+        content_type: Optional[str] = None,
     ) -> Dict[str, Any]:
         """Scrape a page and return it in the specified format.
 
         Args:
             url: URL to scrape
-            format: Output format - 'markdown', 'html', 'screenshot', or 'branding'
+            format: Legacy single output format
             fetch_config: Fetch configuration options
         """
         logger.info(f"Scraping {url} (format={format})")
-        request = ScrapeRequest(
-            url=url,
-            format=ScrapeFormat(format),
-            fetch_config=fetch_config,
-        )
         return await self._make_request(
-            "POST", f"{self.base_url}/scrape", json=request.model_dump()
+            "POST",
+            f"{self.base_url}/scrape",
+            json=build_scrape_payload(
+                url,
+                format=format,
+                fetch_config=fetch_config,
+                formats=formats,
+                content_type=content_type,
+            ),
         )
 
     # ------------------------------------------------------------------
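
The legacy single format argument keeps working while the new formats parameter takes a list of format descriptors. A sketch under the same client context; the exact shape of each descriptor dict is not defined in this diff.

    # Legacy single-format call:
    page = await client.scrape("https://example.com", format="markdown")

    # New multi-format call; {"type": ...} is an assumed descriptor shape.
    page = await client.scrape(
        "https://example.com",
        formats=[{"type": "markdown"}, {"type": "screenshot"}],
    )
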
@@ -307,43 +333,43 @@ async def scrape(
 
     async def extract(
         self,
-        url: str,
+        url: Optional[str],
         prompt: str,
         output_schema: Optional[Any] = None,
         fetch_config: Optional[FetchConfig] = None,
         llm_config: Optional[LlmConfig] = None,
+        *,
+        schema: Optional[Any] = None,
+        mode: str = "normal",
+        content_type: Optional[str] = None,
+        html: Optional[str] = None,
+        markdown: Optional[str] = None,
     ) -> Dict[str, Any]:
         """Extract structured data from a page using AI.
 
         Args:
             url: URL to extract data from
             prompt: Natural language prompt describing what to extract
-            output_schema: JSON Schema dict or Pydantic BaseModel class for output structure
+            output_schema: Legacy alias for schema
             fetch_config: Fetch configuration options
-            llm_config: LLM configuration options
+            llm_config: Deprecated and ignored by the SGAI v2 extract route
         """
         logger.info(f"Extracting from {url}")
-
-        schema_dict = None
-        if output_schema is not None:
-            if isinstance(output_schema, type) and issubclass(output_schema, BaseModel):
-                schema_dict = output_schema.model_json_schema()
-            elif isinstance(output_schema, dict):
-                schema_dict = output_schema
-            else:
-                raise ValueError(
-                    "output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class"
-                )
-
-        request = ExtractRequest(
-            url=url,
-            prompt=prompt,
-            output_schema=schema_dict,
-            fetch_config=fetch_config,
-            llm_config=llm_config,
-        )
         return await self._make_request(
-            "POST", f"{self.base_url}/extract", json=request.model_dump()
+            "POST",
+            f"{self.base_url}/extract",
+            json=build_extract_payload(
+                url=url,
+                prompt=prompt,
+                output_schema=output_schema,
+                fetch_config=fetch_config,
+                llm_config=llm_config,
+                schema=schema,
+                mode=mode,
+                content_type=content_type,
+                html=html,
+                markdown=markdown,
+            ),
         )
 
     # ------------------------------------------------------------------
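
The inline BaseModel/dict handling is gone, so build_extract_payload presumably performs the schema conversion now. A usage sketch in the same client context, including the local-content mode enabled by the new html/markdown parameters; Pydantic-class support moving into the builder is an inference, not something this diff shows.

    from pydantic import BaseModel

    class Product(BaseModel):
        name: str
        price: float

    # schema accepts a JSON Schema dict; a Pydantic class is assumed to also work.
    data = await client.extract(
        url="https://example.com/product",
        prompt="Extract the product name and price",
        schema=Product,
    )

    # Local-content mode: pass raw HTML (or markdown) instead of fetching a URL.
    data = await client.extract(
        url=None,
        prompt="Extract the product name and price",
        schema=Product,
        html="<html><body><h1>Widget</h1><p>$9.99</p></body></html>",
    )
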
@@ -357,38 +383,40 @@ async def search(
         output_schema: Optional[Any] = None,
         location_geo_code: Optional[str] = None,
         llm_config: Optional[LlmConfig] = None,
+        *,
+        schema: Optional[Any] = None,
+        prompt: Optional[str] = None,
+        format: str = "markdown",
+        mode: str = "prune",
+        fetch_config: Optional[FetchConfig] = None,
+        time_range: Optional[str] = None,
     ) -> Dict[str, Any]:
         """Search the web and extract structured results.
 
         Args:
             query: The search query
             num_results: Number of results (3-20, default 5)
-            output_schema: JSON Schema dict or Pydantic BaseModel class for output structure
-            location_geo_code: Two-letter country code for geo-targeted results (e.g. 'us', 'gb')
-            llm_config: LLM configuration options
+            output_schema: Legacy alias for schema
+            location_geo_code: Geo code for geo-targeted results
+            llm_config: Deprecated and ignored by the SGAI v2 search route
         """
         logger.info(f"Searching: {query}")
-
-        schema_dict = None
-        if output_schema is not None:
-            if isinstance(output_schema, type) and issubclass(output_schema, BaseModel):
-                schema_dict = output_schema.model_json_schema()
-            elif isinstance(output_schema, dict):
-                schema_dict = output_schema
-            else:
-                raise ValueError(
-                    "output_schema must be a dict (JSON Schema) or a Pydantic BaseModel class"
-                )
-
-        request = SearchRequest(
-            query=query,
-            num_results=num_results,
-            output_schema=schema_dict,
-            location_geo_code=location_geo_code,
-            llm_config=llm_config,
-        )
         return await self._make_request(
-            "POST", f"{self.base_url}/search", json=request.model_dump()
+            "POST",
+            f"{self.base_url}/search",
+            json=build_search_payload(
+                query=query,
+                num_results=num_results,
+                output_schema=output_schema,
+                location_geo_code=location_geo_code,
+                llm_config=llm_config,
+                schema=schema,
+                prompt=prompt,
+                format=format,
+                mode=mode,
+                fetch_config=fetch_config,
+                time_range=time_range,
+            ),
         )
 
     # ------------------------------------------------------------------
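
search now takes an optional extraction prompt plus format, mode, and time_range controls. A hedged example in the same client context; the accepted time_range strings are not listed in this diff.

    # Plain search:
    results = await client.search("latest aiohttp release", num_results=5)

    # Search plus AI extraction over the results; "month" is an assumed value.
    structured = await client.search(
        "top Python web scraping libraries",
        prompt="List each library with a one-line description",
        schema={"type": "object", "properties": {"libraries": {"type": "array"}}},
        time_range="month",
    )
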
@@ -410,22 +438,58 @@ async def history(
         status: Optional[str] = None,
         limit: Optional[int] = None,
         offset: Optional[int] = None,
+        *,
+        page: Optional[int] = None,
+        service: Optional[str] = None,
     ) -> Dict[str, Any]:
         """Retrieve request history.
 
         Args:
-            endpoint: Filter by endpoint name (e.g. 'scrape', 'extract')
-            status: Filter by request status
+            endpoint: Legacy alias for service
+            status: Unsupported in SGAI v2
             limit: Maximum number of results (1-100)
-            offset: Number of results to skip
+            offset: Legacy alias mapped onto page when possible
         """
         logger.info("Fetching history")
-        filter_obj = HistoryFilter(
-            endpoint=endpoint, status=status, limit=limit, offset=offset
+        return await self._make_request(
+            "GET",
+            f"{self.base_url}/history",
+            params=build_history_params(
+                endpoint=endpoint,
+                status=status,
+                limit=limit,
+                offset=offset,
+                page=page,
+                service=service,
+            )
+            or None,
+        )
+
+    # ------------------------------------------------------------------
+    # Schema / Validate
+    # ------------------------------------------------------------------
+
+    async def schema(
+        self,
+        prompt: str,
+        existing_schema: Optional[Any] = None,
+        model: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Generate or refine a JSON schema from a prompt."""
+        logger.info("Generating schema")
+        return await self._make_request(
+            "POST",
+            f"{self.base_url}/schema",
+            json=build_schema_payload(
+                prompt, existing_schema=existing_schema, model=model
+            ),
         )
-        params = filter_obj.to_params()
+
+    async def validate(self, email: str) -> Dict[str, Any]:
+        """Validate an email address against SGAI's allowlist endpoint."""
+        logger.info("Validating email")
         return await self._make_request(
-            "GET", f"{self.base_url}/history", params=params or None
+            "GET", f"{self.base_url}/validate", params=build_validate_params(email)
         )
 
     # ------------------------------------------------------------------
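
The new-style history pagination plus the added schema and validate endpoints, sketched in the same client context and under the same assumptions as the earlier snippets.

    # page/service are the new filters; legacy endpoint/offset are mapped onto them.
    hist = await client.history(service="extract", limit=20, page=1)

    # Generate a JSON schema from a natural-language description.
    generated = await client.schema("A product with name, price, and rating")

    # Email validation; the response shape is not shown in this diff.
    check = await client.validate("user@example.com")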