Skip to content

Commit 6e7cfed

Browse files
committed
Implemented caching repositories via topics
1 parent 6d3bda6 commit 6e7cfed

1 file changed

Lines changed: 236 additions & 1 deletion

File tree

scripts/fetch_all_categories.py

Lines changed: 236 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,15 @@
2424
"trending": os.environ.get("GH_TOKEN_TRENDING"),
2525
"new-releases": os.environ.get("GH_TOKEN_NEW_RELEASES"),
2626
"most-popular": os.environ.get("GH_TOKEN_MOST_POPULAR"),
27+
"topics": os.environ.get("GH_TOKEN_TOPICS"),
2728
}
2829
FALLBACK_TOKEN = os.environ.get("GITHUB_TOKEN")
2930

3031
# Ensure at least one token is available
3132
_any_token = any(CATEGORY_TOKENS.values()) or FALLBACK_TOKEN
3233
if not _any_token:
3334
print("ERROR: No GitHub tokens set. Set GH_TOKEN_TRENDING / GH_TOKEN_NEW_RELEASES / "
34-
"GH_TOKEN_MOST_POPULAR, or GITHUB_TOKEN as fallback.", file=sys.stderr)
35+
"GH_TOKEN_MOST_POPULAR / GH_TOKEN_TOPICS, or GITHUB_TOKEN as fallback.", file=sys.stderr)
3536
sys.exit(1)
3637

3738

@@ -848,6 +849,165 @@ async def fetch_most_popular(client: GitHubClient, platform: str, budget: Option
848849
return [r.to_summary("most-popular") for r in verified]
849850

850851

852+
# ─── Topic definitions ─────────────────────────────────────────────────────

# Maps topic category name → list of GitHub topics to search for.
# Mirrors TopicCategory enum in the app (feature/home/domain/model/TopicCategory.kt).
#
# Per category:
#   "topics"   — GitHub repository topics used in topic:-qualified search queries.
#   "keywords" — terms matched against repo name/description (only a prefix of
#                this list is used when building queries; see fetch_topic).
TOPIC_CATEGORIES = {
    "privacy": {
        "topics": [
            "privacy", "security", "encryption", "vpn", "firewall",
            "password-manager", "privacy-tools", "e2ee", "secure",
            "anonymity", "tor", "pgp", "2fa", "auth",
        ],
        "keywords": ["privacy", "security", "encryption", "vpn", "firewall"],
    },
    "media": {
        "topics": [
            "music-player", "video-player", "media", "podcast", "streaming",
            "audio", "video", "media-player", "music", "player",
            "mpv", "vlc", "recorder", "screen-recorder", "gallery",
        ],
        "keywords": ["music-player", "video-player", "media", "podcast", "audio"],
    },
    "productivity": {
        "topics": [
            "productivity", "file-manager", "notes", "launcher", "keyboard",
            "browser", "calendar", "todo", "note-taking", "editor",
            "organizer", "task-manager", "markdown", "writing",
        ],
        "keywords": ["productivity", "file-manager", "notes", "launcher", "browser"],
    },
    "networking": {
        # NOTE: "firewall" intentionally appears here and under "privacy";
        # dedup across categories (if any) happens downstream, not here.
        "topics": [
            "proxy", "dns", "ad-blocker", "torrent", "downloader",
            "network", "ssh", "wireguard", "adblock", "download-manager",
            "firewall", "socks5", "http-proxy", "p2p", "ftp",
        ],
        "keywords": ["proxy", "dns", "ad-blocker", "torrent", "downloader", "network"],
    },
    "dev-tools": {
        "topics": [
            "terminal", "developer-tools", "git-client", "editor", "cli",
            "ide", "devtools", "code-editor", "terminal-emulator", "development",
            "adb", "debugger", "api-client", "shell", "sdk",
        ],
        "keywords": ["terminal", "developer-tools", "git-client", "code-editor", "cli"],
    },
}
898+
899+
900+
async def fetch_topic(
    client: GitHubClient,
    platform: str,
    topic_name: str,
    topic_config: Dict,
    budget: Optional[int] = None,
) -> List[Dict]:
    """Fetch repos matching a topic category for a platform.

    Builds search-query specs in decreasing relevance order (topic ∩
    platform-topic, keyword ∩ platform-topic, topic ∩ language,
    topic-only high-star), collects candidates, verifies installers
    within the given request budget, and returns summaries sorted by
    score then stars.

    Args:
        client: Authenticated GitHub API client.
        topic_name: Category name (a TOPIC_CATEGORIES key) — used for logging.
        topic_config: A TOPIC_CATEGORIES value: dict with "topics" and
            "keywords" lists.
        platform: Key into PLATFORMS.
        budget: Optional cap on API requests for verify_installers.

    Returns:
        List of summary dicts (candidate.to_summary("topic")).
    """
    print(f"\n{'='*60}")
    # Separator so the banner reads e.g. "TOPIC: PRIVACY × ANDROID"
    # instead of the two words fused together ("PRIVACYANDROID").
    print(f"TOPIC: {topic_name.upper()} × {platform.upper()}")
    print(f"{'='*60}")

    topics = topic_config["topics"]
    keywords = topic_config["keywords"]
    platform_topics = PLATFORMS[platform]["topics"]
    all_langs = PLATFORMS[platform]["languages"]["primary"] + PLATFORMS[platform]["languages"]["secondary"]
    seen: Set[str] = set()

    # One snapshot of "now" so both cutoffs are computed from the same
    # instant (two separate utcnow() calls could straddle a day boundary).
    # NOTE(review): datetime.utcnow() is deprecated in 3.12+; kept here for
    # consistency with the rest of the file.
    now = datetime.utcnow()
    one_year = (now - timedelta(days=365)).strftime("%Y-%m-%d")
    two_years = (now - timedelta(days=730)).strftime("%Y-%m-%d")

    specs = []

    # 1) Topic-based searches: cross topic categories with platform topics
    #    e.g. "topic:privacy topic:android" — most relevant results.
    for topic in topics[:8]:  # top 8 topics to avoid too many queries
        base = f"stars:>10 archived:false pushed:>={one_year}"
        specs.append({
            "query": _build_query(base, topics=[topic] + platform_topics[:2]),
            "sort": "stars", "pages": 3, "weight": 1.5,
        })

    # 2) Keyword in description/name + platform topic
    for kw in keywords[:3]:
        base = f"stars:>20 archived:false pushed:>={one_year}"
        specs.append({
            "query": _build_query(base, topics=platform_topics[:2], description_kw=kw),
            "sort": "stars", "pages": 3, "weight": 1.2,
        })

    # 3) Topic + primary language (catches repos without platform topic)
    for topic in topics[:5]:
        for lang in all_langs[:2]:
            base = f"stars:>20 archived:false pushed:>={one_year}"
            specs.append({
                "query": _build_query(base, topics=[topic], language=lang),
                "sort": "stars", "pages": 3, "weight": 1.0,
            })

    # 4) Broader: topic only, high stars (platform-agnostic gems)
    for topic in topics[:5]:
        base = f"stars:>500 archived:false pushed:>={two_years}"
        specs.append({
            "query": _build_query(base, topics=[topic]),
            "sort": "stars", "pages": 3, "weight": 0.8,
        })

    candidates = await _collect_candidates(
        client, specs, platform, seen, compute_velocity=False, min_score=0,
    )
    print(f" {len(candidates)} candidates from {len(seen)} unique repos")

    # Sort by score (platform relevance) then stars BEFORE verification,
    # so the request budget is spent on the most promising repos first.
    candidates.sort(key=lambda c: (c.score, c.stars), reverse=True)
    verified = await verify_installers(client, candidates, platform, need_release_date=True, budget=budget)

    verified.sort(key=lambda c: (c.score, c.stars), reverse=True)
    print(f" ✓ {len(verified)} {topic_name} repos verified for {platform}")
    return [r.to_summary("topic") for r in verified]
969+
970+
971+
async def process_topics(client: GitHubClient, timestamp: str):
    """Process every topic category across all platforms.

    For each (topic, platform) slot: skip if a fresh cache exists,
    otherwise fetch, and only save when the result is non-empty and not
    worse than the existing cache. The per-slot API budget divides the
    remaining rate limit evenly over the slots still to be processed.

    Args:
        client: Authenticated GitHub API client (its _rate_remaining
            drives the per-slot budget).
        timestamp: Run timestamp recorded alongside saved data.
    """
    print(f"\n{'#'*70}")
    # Derive the category count instead of hard-coding "5" so the banner
    # stays correct if TOPIC_CATEGORIES ever changes.
    print(f"# TOPICS ({len(TOPIC_CATEGORIES)} categories × {len(PLATFORMS)} platforms)")
    print(f"{'#'*70}")

    num_topics = len(TOPIC_CATEGORIES)
    num_platforms = len(PLATFORMS)
    total_slots = num_topics * num_platforms

    for topic_idx, (topic_name, topic_config) in enumerate(TOPIC_CATEGORIES.items()):
        print(f"\n--- Topic {topic_idx + 1}/{num_topics}: {topic_name} ---")

        for platform_idx, platform in enumerate(PLATFORMS):
            # Split the remaining budget evenly over the slots left, with a
            # floor of 50 requests so late slots are not starved.
            slot = topic_idx * num_platforms + platform_idx
            slots_remaining = total_slots - slot
            budget = max((client._rate_remaining - RATE_LIMIT_FLOOR) // slots_remaining, 50)

            # A still-fresh cache means nothing to do for this slot.
            if load_cache(f"topics/{topic_name}", platform):
                continue

            repos = await fetch_topic(client, platform, topic_name, topic_config, budget)

            if not repos:
                existing = _load_existing_count(f"topics/{topic_name}", platform)
                print(f" ⚠ 0 repos fetched — skipping save (existing cache: {existing} repos)")
                continue

            # Lower threshold for topics — even 5 repos is useful
            min_threshold = 5
            if len(repos) < min_threshold:
                # Don't overwrite a healthier cache with a thin fetch.
                existing = _load_existing_count(f"topics/{topic_name}", platform)
                if existing >= min_threshold:
                    print(f" ⚠ Only {len(repos)} repos fetched but cache has {existing} — keeping cached data")
                    continue

            save_data(f"topics/{topic_name}", platform, repos, timestamp)
1009+
1010+
8511011
# ─── Cache I/O ─────────────────────────────────────────────────────────────────
8521012

8531013

@@ -1009,6 +1169,81 @@ async def main():
10091169
f"{len(client.release_cache)} release cache entries, "
10101170
f"{client._rate_remaining} requests remaining")
10111171

1172+
# ─── Phase 2: Topics ─────────────────────────────────────────────────────
1173+
# Use GH_TOKEN_TOPICS (dedicated) plus scavenge leftover budget from
1174+
# the 3 category tokens.
1175+
1176+
topics_token = CATEGORY_TOKENS.get("topics") or FALLBACK_TOKEN
1177+
if topics_token:
1178+
print(f"\n{'#'*70}")
1179+
print(f"# PHASE 2: TOPICS")
1180+
print(f"{'#'*70}")
1181+
1182+
# Collect all usable tokens: dedicated topics token + leftover from category tokens
1183+
topic_tokens = [topics_token]
1184+
for cat_name in ["trending", "new-releases", "most-popular"]:
1185+
cat_token = CATEGORY_TOKENS.get(cat_name)
1186+
if cat_token and cat_token != topics_token:
1187+
topic_tokens.append(cat_token)
1188+
1189+
# Deduplicate while preserving order
1190+
seen_tokens: Set[str] = set()
1191+
unique_tokens = []
1192+
for t in topic_tokens:
1193+
if t not in seen_tokens:
1194+
seen_tokens.add(t)
1195+
unique_tokens.append(t)
1196+
1197+
print(f" Using {len(unique_tokens)} token(s) for topics")
1198+
1199+
# Check remaining budget on each token
1200+
usable_clients = []
1201+
for i, tok in enumerate(unique_tokens):
1202+
async with GitHubClient(tok) as probe:
1203+
data, _ = await probe.get("https://api.github.com/rate_limit")
1204+
if data:
1205+
remaining = data.get("resources", {}).get("core", {}).get("remaining", 0)
1206+
limit = data.get("resources", {}).get("core", {}).get("limit", 0)
1207+
label = "dedicated" if i == 0 else f"leftover-{i}"
1208+
print(f" Token {label}: {remaining}/{limit} remaining")
1209+
if remaining > RATE_LIMIT_FLOOR + 100:
1210+
usable_clients.append((tok, remaining))
1211+
1212+
if usable_clients:
1213+
# Use the token with the most remaining budget first
1214+
usable_clients.sort(key=lambda x: x[1], reverse=True)
1215+
best_token, best_remaining = usable_clients[0]
1216+
print(f" Selected token with {best_remaining} remaining requests")
1217+
1218+
async with GitHubClient(best_token) as client:
1219+
client._rate_remaining = best_remaining
1220+
await process_topics(client, timestamp)
1221+
total_requests += client._request_count
1222+
total_cache_entries += len(client.release_cache)
1223+
print(f" [topics] Used {client._request_count} API requests, "
1224+
f"{len(client.release_cache)} release cache entries, "
1225+
f"{client._rate_remaining} requests remaining")
1226+
1227+
# If first token ran low, try remaining tokens
1228+
if client._rate_remaining <= RATE_LIMIT_FLOOR + 50 and len(usable_clients) > 1:
1229+
for tok, rem in usable_clients[1:]:
1230+
print(f"\n Switching to next token ({rem} remaining)...")
1231+
async with GitHubClient(tok) as fallback_client:
1232+
fallback_client._rate_remaining = rem
1233+
# Share release cache for efficiency
1234+
fallback_client.release_cache = client.release_cache
1235+
await process_topics(fallback_client, timestamp)
1236+
total_requests += fallback_client._request_count
1237+
total_cache_entries += len(fallback_client.release_cache)
1238+
print(f" [topics-fallback] Used {fallback_client._request_count} API requests, "
1239+
f"{fallback_client._rate_remaining} remaining")
1240+
if fallback_client._rate_remaining > RATE_LIMIT_FLOOR + 50:
1241+
break # Still has budget, done
1242+
else:
1243+
print(" ⚠ No tokens with enough budget for topics — skipping")
1244+
else:
1245+
print("\n⚠ No token available for topics — set GH_TOKEN_TOPICS or GITHUB_TOKEN")
1246+
10121247
elapsed = time.time() - start
10131248
print(f"\n{'='*70}")
10141249
print(f"✓ DONE in {elapsed:.0f}s ({elapsed/60:.1f} min)")

0 commit comments

Comments
 (0)