Skip to content

Commit 1550cc3

Browse files
committed
Added collections import and two new constants: SEARCH_RATE_LIMIT = 28, SEARCH_RATE_WINDOW = 60
Added a sliding-window rate limiter to GitHubClient: a _search_call_times deque plus a _search_rate_lock for thread-safe tracking. Added an _acquire_search_slot() method that tracks timestamps and auto-pauses when approaching 30 req/min. Integrated into get() — any call to /search/ endpoints automatically goes through the rate limiter before proceeding. Added error logging in search_repos() — failed queries now print the error reason instead of silently returning []. Removed the 65s inter-platform pause — no longer needed since the rate limiter paces calls automatically across all platforms.
1 parent 5d4ae25 commit 1550cc3

1 file changed

Lines changed: 25 additions & 9 deletions

File tree

scripts/fetch_all_categories.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import asyncio
1212
import aiohttp
1313
import time
14+
import collections
1415
from datetime import datetime, timedelta, timezone
1516
from typing import List, Dict, Optional, Tuple, Set
1617
from dataclasses import dataclass, field
@@ -55,6 +56,8 @@ def get_token(category: str) -> str:
5556
REQUEST_TIMEOUT = 20 # seconds
5657
MAX_RETRIES = 3
5758
RATE_LIMIT_FLOOR = 50 # stop verifying when remaining requests drop below this
59+
SEARCH_RATE_LIMIT = 28 # max search API calls per window (GitHub allows 30, leave buffer)
60+
SEARCH_RATE_WINDOW = 60 # sliding window in seconds
5861
FORCE_REFRESH = os.environ.get("FORCE_REFRESH", "").lower() in ("true", "1", "yes")
5962

6063
# Topics / keywords that indicate NSFW or inappropriate content.
@@ -244,6 +247,9 @@ def __init__(self, token: str):
244247
self._request_count = 0
245248
self._rate_remaining = 5000
246249
self._rate_reset: Optional[float] = None
250+
# Search API rate limiter (sliding window)
251+
self._search_call_times: collections.deque = collections.deque()
252+
self._search_rate_lock = asyncio.Lock()
247253
# Cross-repo release cache: full_name -> ReleaseInfo
248254
self.release_cache: Dict[str, ReleaseInfo] = {}
249255

@@ -282,8 +288,24 @@ def _update_rate_info(self, headers, url: str):
282288
if reset is not None:
283289
self._rate_reset = float(reset)
284290

291+
async def _acquire_search_slot(self):
    """Block until a GitHub search API call may proceed.

    Sliding-window rate limiter for the search endpoints: timestamps of
    recent search calls are kept in ``self._search_call_times`` (a deque),
    entries older than ``SEARCH_RATE_WINDOW`` seconds are pruned, and when
    the window already holds ``SEARCH_RATE_LIMIT`` calls we sleep until the
    oldest one ages out. Holding ``self._search_rate_lock`` across the sleep
    is deliberate: it serializes concurrent waiters so they are released one
    at a time instead of stampeding when the window frees up.
    """
    async with self._search_rate_lock:
        now = time.time()
        # Prune timestamps that have aged out of the sliding window.
        while self._search_call_times and now - self._search_call_times[0] >= SEARCH_RATE_WINDOW:
            self._search_call_times.popleft()
        if len(self._search_call_times) >= SEARCH_RATE_LIMIT:
            # Window is full: wait until the oldest recorded call leaves
            # it, plus a 1s safety margin against clock differences with
            # GitHub's server-side accounting.
            oldest = self._search_call_times[0]
            wait = SEARCH_RATE_WINDOW - (now - oldest) + 1
            if wait > 0:
                # Fix: report the configured window length instead of the
                # previously hardcoded "60s", so the message stays correct
                # if SEARCH_RATE_WINDOW is ever tuned.
                print(
                    f" ⏳ Search rate limit: {len(self._search_call_times)}/{SEARCH_RATE_LIMIT} "
                    f"in last {SEARCH_RATE_WINDOW}s, pausing {wait:.0f}s"
                )
                await asyncio.sleep(wait)
        # Record this call's timestamp (taken after any sleep so the
        # window reflects when the request actually proceeds).
        self._search_call_times.append(time.time())
304+
285305
async def get(self, url: str, params: Optional[Dict] = None) -> Tuple[Optional[Dict], Optional[str]]:
286306
"""GET with retry, rate-limit handling, and concurrency control."""
307+
if "/search/" in url:
308+
await self._acquire_search_slot()
287309
async with self._sem:
288310
await self._wait_for_rate_limit()
289311
for attempt in range(MAX_RETRIES):
@@ -333,6 +355,7 @@ async def search_repos(self, query: str, sort: str = "stars", order: str = "desc
333355
params={"q": query, "sort": sort, "order": order, "per_page": 100, "page": 1},
334356
)
335357
if not data:
358+
print(f" ⚠ Search query failed: {err}")
336359
return []
337360

338361
items = data.get("items", [])
@@ -941,21 +964,14 @@ async def process_platform(platform: str, budget: int):
941964

942965
# Process platforms SEQUENTIALLY to avoid rate-limit thrashing.
943966
# The release cache still benefits later platforms from earlier ones.
944-
# Pause between platforms so the search API rate limit (30 req/min) can reset.
967+
# Search API pacing is handled by _acquire_search_slot() (sliding window rate limiter).
945968
platforms_left = list(PLATFORMS.keys())
946-
prev_ran_searches = False
947969
for i, p in enumerate(platforms_left):
948970
# Recalculate budget for remaining platforms so unused budget carries forward
949971
platforms_remaining = num_platforms - i
950972
budget = max((client._rate_remaining - RATE_LIMIT_FLOOR) // platforms_remaining, 100)
951973

952-
# Wait for search rate limit (30 req/min) to reset between platforms
953-
if prev_ran_searches:
954-
print(f" ⏳ Waiting 65s for search API rate limit reset...")
955-
await asyncio.sleep(65)
956-
957-
ran = await process_platform(p, budget)
958-
prev_ran_searches = ran
974+
await process_platform(p, budget)
959975

960976

961977
async def main():

0 commit comments

Comments
 (0)