Skip to content

Commit b5374d7

Browse files
committed
Update fetching logic for topic repos
1 parent a07df28 commit b5374d7

1 file changed

Lines changed: 143 additions & 47 deletions

File tree

scripts/fetch_all_categories.py

Lines changed: 143 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -897,68 +897,60 @@ async def fetch_most_popular(client: GitHubClient, platform: str, budget: Option
897897
}
898898

899899

900-
async def fetch_topic(
900+
async def search_topic_candidates(
901901
client: GitHubClient,
902-
platform: str,
903902
topic_name: str,
904903
topic_config: Dict,
905-
budget: Optional[int] = None,
906-
) -> List[Dict]:
907-
"""Fetch repos matching a topic category for a platform."""
904+
) -> List[RepoCandidate]:
905+
"""Search GitHub for repos matching a topic category (platform-agnostic).
906+
907+
Returns unverified candidates — call verify_installers() per platform.
908+
"""
908909
print(f"\n{'='*60}")
909-
print(f"TOPIC: {topic_name.upper()} {platform.upper()}")
910+
print(f"TOPIC SEARCH: {topic_name.upper()} (all platforms)")
910911
print(f"{'='*60}")
911912

912913
topics = topic_config["topics"]
913914
keywords = topic_config["keywords"]
914-
platform_topics = PLATFORMS[platform]["topics"]
915-
all_langs = PLATFORMS[platform]["languages"]["primary"] + PLATFORMS[platform]["languages"]["secondary"]
916915
seen: Set[str] = set()
917916

918917
one_year = (datetime.utcnow() - timedelta(days=365)).strftime("%Y-%m-%d")
919918
two_years = (datetime.utcnow() - timedelta(days=730)).strftime("%Y-%m-%d")
920919

921920
specs = []
922921

923-
# Strategy: fewer, broader queries to stay within search rate limits.
924-
# Each query uses OR to combine multiple category topics, reducing total
925-
# search API calls from ~78 to ~18 per topic×platform.
922+
# Strategy: search by CATEGORY TOPICS ONLY — do NOT cross with platform topics.
923+
# Almost no repos tag themselves with both "terminal" AND "windows".
924+
# Platform filtering happens at the installer-verification step (checks for
925+
# .apk, .exe, .dmg, .deb etc. in release assets), which runs per-platform.
926926

927-
# 1) Batch category topics into groups of 4, cross with platform topics
928-
# e.g. "(topic:terminal OR topic:developer-tools OR topic:cli OR topic:ide)"
929-
# AND "(topic:android OR topic:android-app)"
930-
# Using AND between groups (implicit) so results must match BOTH category AND platform.
931-
# Note: search_repos() prepends "fork:true" automatically.
927+
# 1) Batch the first 12 category topics into groups of 4; results are sorted by stars
932928
topic_batches = [topics[i:i+4] for i in range(0, min(len(topics), 12), 4)]
933929
for batch in topic_batches:
934-
base = f"stars:>10 archived:false pushed:>={one_year}"
930+
base = f"stars:>50 archived:false pushed:>={one_year}"
935931
cat_or = " OR ".join(f"topic:{t}" for t in batch)
936-
plat_or = " OR ".join(f"topic:{t}" for t in platform_topics[:2])
937932
specs.append({
938-
"query": f"{base} ({cat_or}) ({plat_or})",
933+
"query": f"{base} ({cat_or})",
939934
"sort": "stars", "pages": 3, "weight": 1.5,
940935
})
941936

942-
# 2) Keywords in name/description + platform topic (top 3 keywords)
937+
# 2) Keywords in name/description
943938
for kw in keywords[:3]:
944-
base = f"stars:>20 archived:false pushed:>={one_year}"
945-
specs.append({
946-
"query": _build_query(base, topics=platform_topics[:2], description_kw=kw),
947-
"sort": "stars", "pages": 3, "weight": 1.2,
948-
})
949-
950-
# 3) All category topics combined + primary language (catches repos without platform topic)
951-
# Single query per language instead of per-batch to reduce total API calls.
952-
cat_lang_or = " OR ".join(f"topic:{t}" for t in topics[:8])
953-
for lang in all_langs[:2]:
954-
base = f"stars:>20 archived:false pushed:>={one_year}"
939+
base = f"stars:>100 archived:false pushed:>={one_year}"
955940
specs.append({
956-
"query": f"{base} ({cat_lang_or}) language:{lang}",
957-
"sort": "stars", "pages": 2, "weight": 1.0,
941+
"query": f"{base} {kw} in:name,description",
942+
"sort": "stars", "pages": 2, "weight": 1.2,
958943
})
959944

960-
# 4) Broader: all category topics combined, high stars (platform-agnostic)
945+
# 3) First 8 category topics combined, lower star threshold, sorted by most recently updated
961946
cat_all_or = " OR ".join(f"topic:{t}" for t in topics[:8])
947+
base = f"stars:>20 archived:false pushed:>={one_year}"
948+
specs.append({
949+
"query": f"{base} ({cat_all_or})",
950+
"sort": "updated", "pages": 3, "weight": 1.0,
951+
})
952+
953+
# 4) Broader: high stars, wider time window (catches established projects)
962954
base = f"stars:>500 archived:false pushed:>={two_years}"
963955
specs.append({
964956
"query": f"{base} ({cat_all_or})",
@@ -967,46 +959,110 @@ async def fetch_topic(
967959

968960
print(f" {len(specs)} search specs ({sum(s.get('pages', 3) for s in specs)} API calls)")
969961

962+
# Use "android" as dummy platform for make_candidate scoring — score is
963+
# recalculated in verify_installers anyway, and we just need dedup here.
970964
candidates = await _collect_candidates(
971-
client, specs, platform, seen, compute_velocity=False, min_score=0,
965+
client, specs, "android", seen, compute_velocity=False, min_score=0,
972966
)
973967
print(f" {len(candidates)} candidates from {len(seen)} unique repos")
968+
candidates.sort(key=lambda c: c.stars, reverse=True)
969+
return candidates
974970

975-
# Sort by score (platform relevance) then stars
976-
candidates.sort(key=lambda c: (c.score, c.stars), reverse=True)
977-
verified = await verify_installers(client, candidates, platform, need_release_date=True, budget=budget)
978971

972+
async def verify_topic_for_platform(
973+
client: GitHubClient,
974+
candidates: List[RepoCandidate],
975+
platform: str,
976+
topic_name: str,
977+
budget: Optional[int] = None,
978+
) -> List[Dict]:
979+
"""Verify which candidates have installers for a specific platform."""
980+
print(f"\n --- {topic_name}/{platform}: verifying {len(candidates)} candidates ---")
981+
verified = await verify_installers(
982+
client, candidates, platform, need_release_date=True, budget=budget,
983+
)
979984
verified.sort(key=lambda c: (c.score, c.stars), reverse=True)
980985
print(f" ✓ {len(verified)} {topic_name} repos verified for {platform}")
981986
return [r.to_summary("topic") for r in verified]
982987

983988

984989
async def process_topics(client: GitHubClient, timestamp: str):
985-
"""Process all 5 topic categories across all platforms."""
990+
"""Process all 5 topic categories across all platforms.
991+
992+
Optimization: search queries are platform-agnostic, so we search ONCE per
993+
topic and then verify installers per-platform. This saves ~75% of search
994+
API calls compared to searching per topic×platform.
995+
"""
986996
print(f"\n{'#'*70}")
987997
print(f"# TOPICS (5 categories × {len(PLATFORMS)} platforms)")
988998
print(f"{'#'*70}")
989999

1000+
# ─── Rate limit check ─────────────────────────────────────────────────
1001+
data, _ = await client.get("https://api.github.com/rate_limit")
1002+
if data:
1003+
core = data.get("resources", {}).get("core", {})
1004+
search = data.get("resources", {}).get("search", {})
1005+
print(f" Rate limit — core: {core.get('remaining', '?')}/{core.get('limit', '?')}, "
1006+
f"search: {search.get('remaining', '?')}/{search.get('limit', '?')}")
1007+
remaining = core.get("remaining", 0)
1008+
if remaining < 500:
1009+
print(f" ⚠ WARNING: Low core rate limit ({remaining}) — results may be incomplete")
1010+
search_remaining = search.get("remaining", 0)
1011+
if search_remaining < 5:
1012+
print(f" ⚠ WARNING: Search rate limit nearly exhausted ({search_remaining})")
1013+
9901014
num_topics = len(TOPIC_CATEGORIES)
9911015
num_platforms = len(PLATFORMS)
992-
total_slots = num_topics * num_platforms
1016+
# Budget slots: one verification slot per topic × platform (the shared search is not counted)
1017+
total_verify_slots = num_topics * num_platforms
1018+
1019+
# Track results for final summary
1020+
results_summary: Dict[str, Dict[str, int]] = {}
9931021

9941022
for topic_idx, (topic_name, topic_config) in enumerate(TOPIC_CATEGORIES.items()):
9951023
print(f"\n--- Topic {topic_idx + 1}/{num_topics}: {topic_name} ---")
1024+
results_summary[topic_name] = {}
9961025

997-
for platform_idx, platform in enumerate(PLATFORMS.keys()):
998-
slot = topic_idx * num_platforms + platform_idx
999-
slots_remaining = total_slots - slot
1000-
budget = max((client._rate_remaining - RATE_LIMIT_FLOOR) // slots_remaining, 50)
1001-
1026+
# Check which platforms still need data
1027+
platforms_needed = []
1028+
for platform in PLATFORMS.keys():
10021029
cached = load_cache(f"topics/{topic_name}", platform)
10031030
if cached:
1004-
continue
1031+
# Count existing cached repos for summary
1032+
existing = _load_existing_count(f"topics/{topic_name}", platform)
1033+
results_summary[topic_name][platform] = existing
1034+
print(f" {platform}: cached ({existing} repos)")
1035+
else:
1036+
platforms_needed.append(platform)
1037+
1038+
if not platforms_needed:
1039+
print(f" All platforms cached — skipping")
1040+
continue
1041+
1042+
# Search once for this topic (platform-agnostic)
1043+
candidates = await search_topic_candidates(client, topic_name, topic_config)
10051044

1006-
repos = await fetch_topic(client, platform, topic_name, topic_config, budget)
1045+
if not candidates:
1046+
print(f" ⚠ 0 candidates found — skipping all platforms")
1047+
for platform in platforms_needed:
1048+
existing = _load_existing_count(f"topics/{topic_name}", platform)
1049+
results_summary[topic_name][platform] = existing
1050+
print(f" {platform}: existing cache has {existing} repos")
1051+
continue
1052+
1053+
# Verify installers per platform using the shared candidate list
1054+
for platform_idx, platform in enumerate(platforms_needed):
1055+
verify_slot = topic_idx * num_platforms + platform_idx
1056+
slots_remaining = max(total_verify_slots - verify_slot, 1)
1057+
budget = max((client._rate_remaining - RATE_LIMIT_FLOOR) // slots_remaining, 50)
1058+
1059+
repos = await verify_topic_for_platform(
1060+
client, candidates, platform, topic_name, budget,
1061+
)
10071062

10081063
if len(repos) == 0:
10091064
existing = _load_existing_count(f"topics/{topic_name}", platform)
1065+
results_summary[topic_name][platform] = existing
10101066
print(f" ⚠ 0 repos fetched — skipping save (existing cache: {existing} repos)")
10111067
continue
10121068

@@ -1015,10 +1071,50 @@ async def process_topics(client: GitHubClient, timestamp: str):
10151071
if len(repos) < min_threshold:
10161072
existing = _load_existing_count(f"topics/{topic_name}", platform)
10171073
if existing >= min_threshold:
1074+
results_summary[topic_name][platform] = existing
10181075
print(f" ⚠ Only {len(repos)} repos fetched but cache has {existing} — keeping cached data")
10191076
continue
10201077

10211078
save_data(f"topics/{topic_name}", platform, repos, timestamp)
1079+
results_summary[topic_name][platform] = len(repos)
1080+
1081+
# ─── Final summary ────────────────────────────────────────────────────
1082+
print(f"\n{'='*70}")
1083+
print(f"TOPICS SUMMARY")
1084+
print(f"{'='*70}")
1085+
1086+
# Header
1087+
platforms_list = list(PLATFORMS.keys())
1088+
header = f" {'topic':<16}" + "".join(f"{p:>10}" for p in platforms_list) + f"{'total':>10}"
1089+
print(header)
1090+
print(f" {'-'*16}" + "".join(f"{'-'*10}" for _ in platforms_list) + f"{'-'*10}")
1091+
1092+
grand_total = 0
1093+
for topic_name in TOPIC_CATEGORIES:
1094+
counts = results_summary.get(topic_name, {})
1095+
row = f" {topic_name:<16}"
1096+
topic_total = 0
1097+
for p in platforms_list:
1098+
c = counts.get(p, 0)
1099+
topic_total += c
1100+
row += f"{c:>10}"
1101+
row += f"{topic_total:>10}"
1102+
grand_total += topic_total
1103+
print(row)
1104+
1105+
print(f" {'-'*16}" + "".join(f"{'-'*10}" for _ in platforms_list) + f"{'-'*10}")
1106+
print(f" {'TOTAL':<16}" + "".join(
1107+
f"{sum(results_summary.get(t, {}).get(p, 0) for t in TOPIC_CATEGORIES):>10}"
1108+
for p in platforms_list
1109+
) + f"{grand_total:>10}")
1110+
1111+
# Rate limit after
1112+
data, _ = await client.get("https://api.github.com/rate_limit")
1113+
if data:
1114+
core = data.get("resources", {}).get("core", {})
1115+
search = data.get("resources", {}).get("search", {})
1116+
print(f"\n Rate limit remaining — core: {core.get('remaining', '?')}/{core.get('limit', '?')}, "
1117+
f"search: {search.get('remaining', '?')}/{search.get('limit', '?')}")
10221118

10231119

10241120
# ─── Cache I/O ─────────────────────────────────────────────────────────────────

0 commit comments

Comments
 (0)