|
24 | 24 | "trending": os.environ.get("GH_TOKEN_TRENDING"), |
25 | 25 | "new-releases": os.environ.get("GH_TOKEN_NEW_RELEASES"), |
26 | 26 | "most-popular": os.environ.get("GH_TOKEN_MOST_POPULAR"), |
| 27 | + "topics": os.environ.get("GH_TOKEN_TOPICS"), |
27 | 28 | } |
FALLBACK_TOKEN = os.environ.get("GITHUB_TOKEN")

# Startup guard: the script cannot do anything without credentials, so bail
# out immediately unless a per-category token or the shared fallback is set.
_any_token = any(CATEGORY_TOKENS.values()) or FALLBACK_TOKEN
if not _any_token:
    print(
        "ERROR: No GitHub tokens set. "
        "Set GH_TOKEN_TRENDING / GH_TOKEN_NEW_RELEASES / "
        "GH_TOKEN_MOST_POPULAR / GH_TOKEN_TOPICS, "
        "or GITHUB_TOKEN as fallback.",
        file=sys.stderr,
    )
    raise SystemExit(1)
36 | 37 |
|
37 | 38 |
|
@@ -848,6 +849,165 @@ async def fetch_most_popular(client: GitHubClient, platform: str, budget: Option |
848 | 849 | return [r.to_summary("most-popular") for r in verified] |
849 | 850 |
|
850 | 851 |
|
| 852 | +# ─── Topic definitions ───────────────────────────────────────────────────── |
| 853 | + |
# Topic category name → search vocabulary for that category.
#   "topics":   GitHub topic labels, most relevant first. fetch_topic() only
#               consumes a prefix of this list (the first 8 for the
#               topic × platform searches, the first 5 for the language and
#               platform-agnostic searches), so ordering matters.
#   "keywords": free-text terms; fetch_topic() uses the first 3.
# Mirrors TopicCategory enum in the app
# (feature/home/domain/model/TopicCategory.kt).
TOPIC_CATEGORIES = {
    "privacy": {
        "topics": [
            "privacy", "security", "encryption", "vpn",
            "firewall", "password-manager", "privacy-tools", "e2ee",
            "secure", "anonymity", "tor", "pgp",
            "2fa", "auth",
        ],
        "keywords": [
            "privacy", "security", "encryption", "vpn", "firewall",
        ],
    },
    "media": {
        "topics": [
            "music-player", "video-player", "media", "podcast",
            "streaming", "audio", "video", "media-player",
            "music", "player", "mpv", "vlc",
            "recorder", "screen-recorder", "gallery",
        ],
        "keywords": [
            "music-player", "video-player", "media", "podcast", "audio",
        ],
    },
    "productivity": {
        "topics": [
            "productivity", "file-manager", "notes", "launcher",
            "keyboard", "browser", "calendar", "todo",
            "note-taking", "editor", "organizer", "task-manager",
            "markdown", "writing",
        ],
        "keywords": [
            "productivity", "file-manager", "notes", "launcher", "browser",
        ],
    },
    "networking": {
        "topics": [
            "proxy", "dns", "ad-blocker", "torrent",
            "downloader", "network", "ssh", "wireguard",
            "adblock", "download-manager", "firewall", "socks5",
            "http-proxy", "p2p", "ftp",
        ],
        "keywords": [
            "proxy", "dns", "ad-blocker", "torrent", "downloader", "network",
        ],
    },
    "dev-tools": {
        "topics": [
            "terminal", "developer-tools", "git-client", "editor",
            "cli", "ide", "devtools", "code-editor",
            "terminal-emulator", "development", "adb", "debugger",
            "api-client", "shell", "sdk",
        ],
        "keywords": [
            "terminal", "developer-tools", "git-client", "code-editor", "cli",
        ],
    },
}
| 898 | + |
| 899 | + |
async def fetch_topic(
    client: GitHubClient,
    platform: str,
    topic_name: str,
    topic_config: Dict,
    budget: Optional[int] = None,
) -> List[Dict]:
    """Fetch repos matching a topic category for a platform.

    Builds four families of search specs, in decreasing weight order:

    1. category topic + the first two platform topics (weight 1.5),
    2. keyword + the first two platform topics (weight 1.2),
    3. category topic + one of the first two platform languages (weight 1.0),
    4. category topic alone with a high star floor (weight 0.8).

    The specs are handed to ``_collect_candidates``; the resulting candidates
    are ordered by ``(score, stars)`` descending, passed through
    ``verify_installers`` and re-sorted, then converted with
    ``to_summary("topic")``.

    Args:
        client: GitHub API client used for searching and verification.
        platform: Key into ``PLATFORMS``.
        topic_name: Category name; used here only for log output.
        topic_config: Mapping with ``"topics"`` and ``"keywords"`` lists
            (see ``TOPIC_CATEGORIES``).
        budget: Forwarded unchanged to ``verify_installers``.

    Returns:
        One summary per verified repo, best ``(score, stars)`` first.

    Raises:
        KeyError: If ``platform`` is not in ``PLATFORMS`` or ``topic_config``
            lacks ``"topics"`` / ``"keywords"``.
    """
    # Imported locally so this block does not depend on the file-level import
    # list already exposing ``timezone``.
    from datetime import timezone

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"TOPIC: {topic_name.upper()} — {platform.upper()}")
    print(banner)

    topics = topic_config["topics"]
    keywords = topic_config["keywords"]
    platform_cfg = PLATFORMS[platform]
    platform_topics = platform_cfg["topics"][:2]
    languages = platform_cfg["languages"]
    all_langs = languages["primary"] + languages["secondary"]
    seen: Set[str] = set()

    # Read the clock once so both cut-offs derive from the same instant, and
    # use an aware UTC datetime (datetime.utcnow() is deprecated).
    now = datetime.now(timezone.utc)
    one_year = (now - timedelta(days=365)).strftime("%Y-%m-%d")
    two_years = (now - timedelta(days=730)).strftime("%Y-%m-%d")

    # Loop-invariant qualifier prefixes, built once instead of per iteration.
    recent_10 = f"stars:>10 archived:false pushed:>={one_year}"
    recent_20 = f"stars:>20 archived:false pushed:>={one_year}"
    popular_500 = f"stars:>500 archived:false pushed:>={two_years}"

    def _spec(query: str, weight: float) -> Dict:
        """Wrap a query in the spec shape shared by every search below."""
        return {"query": query, "sort": "stars", "pages": 3, "weight": weight}

    specs = []

    # 1) Topic-based searches: cross topic categories with platform topics,
    #    e.g. "topic:privacy topic:android" — most relevant results.
    #    Capped at the top 8 topics to avoid too many queries.
    for topic in topics[:8]:
        query = _build_query(recent_10, topics=[topic] + platform_topics)
        specs.append(_spec(query, 1.5))

    # 2) Keyword in description/name + platform topic.
    for kw in keywords[:3]:
        query = _build_query(recent_20, topics=platform_topics, description_kw=kw)
        specs.append(_spec(query, 1.2))

    # 3) Topic + primary language (catches repos without platform topic).
    for topic in topics[:5]:
        for lang in all_langs[:2]:
            query = _build_query(recent_20, topics=[topic], language=lang)
            specs.append(_spec(query, 1.0))

    # 4) Broader: topic only, high stars (platform-agnostic gems).
    for topic in topics[:5]:
        query = _build_query(popular_500, topics=[topic])
        specs.append(_spec(query, 0.8))

    candidates = await _collect_candidates(
        client, specs, platform, seen, compute_velocity=False, min_score=0,
    )
    print(f" {len(candidates)} candidates from {len(seen)} unique repos")

    def _rank(candidate):
        """Sort key: score (platform relevance) first, then stars."""
        return (candidate.score, candidate.stars)

    # Rank before verification so the most promising candidates come first,
    # then rank again because verification returns its own list.
    candidates.sort(key=_rank, reverse=True)
    verified = await verify_installers(
        client, candidates, platform, need_release_date=True, budget=budget,
    )
    verified.sort(key=_rank, reverse=True)

    print(f" ✓ {len(verified)} {topic_name} repos verified for {platform}")
    return [repo.to_summary("topic") for repo in verified]
| 969 | + |
| 970 | + |
async def process_topics(client: GitHubClient, timestamp: str) -> None:
    """Process every topic category across all platforms.

    Iterates ``TOPIC_CATEGORIES`` × ``PLATFORMS``. For each (topic, platform)
    slot:

    * a truthy ``load_cache`` result skips the slot entirely;
    * otherwise the slot gets an even share of the client's remaining
      requests above ``RATE_LIMIT_FLOOR`` (never less than 50) and
      ``fetch_topic`` is called with that budget;
    * an empty result is never saved;
    * a result below ``min_threshold`` is saved only when the existing data
      holds fewer than ``min_threshold`` repos itself.

    Args:
        client: GitHub API client; its ``_rate_remaining`` drives budgeting.
        timestamp: Passed through to ``save_data``.
    """
    num_topics = len(TOPIC_CATEGORIES)
    num_platforms = len(PLATFORMS)
    total_slots = num_topics * num_platforms

    # Lower threshold for topics — even 5 repos is useful.
    min_threshold = 5

    rule = "#" * 70
    print(f"\n{rule}")
    # Report the real category count rather than a hard-coded number so the
    # banner stays correct when TOPIC_CATEGORIES grows or shrinks.
    print(f"# TOPICS ({num_topics} categories × {num_platforms} platforms)")
    print(rule)

    for topic_idx, (topic_name, topic_config) in enumerate(TOPIC_CATEGORIES.items()):
        print(f"\n--- Topic {topic_idx + 1}/{num_topics}: {topic_name} ---")
        cache_key = f"topics/{topic_name}"

        for platform_idx, platform in enumerate(PLATFORMS):
            # Check the cache first: no point computing a budget for a slot
            # that will not issue any requests.
            if load_cache(cache_key, platform):
                continue

            # Split what is left evenly over this slot and all later ones.
            # slots_remaining is always >= 1 because slot < total_slots.
            slot = topic_idx * num_platforms + platform_idx
            slots_remaining = total_slots - slot
            budget = max(
                (client._rate_remaining - RATE_LIMIT_FLOOR) // slots_remaining,
                50,
            )

            repos = await fetch_topic(client, platform, topic_name, topic_config, budget)

            if not repos:
                existing = _load_existing_count(cache_key, platform)
                print(f" ⚠ 0 repos fetched — skipping save (existing cache: {existing} repos)")
                continue

            if len(repos) < min_threshold:
                existing = _load_existing_count(cache_key, platform)
                if existing >= min_threshold:
                    # A thin result must not overwrite a healthier data set.
                    print(f" ⚠ Only {len(repos)} repos fetched but cache has {existing} — keeping cached data")
                    continue

            save_data(cache_key, platform, repos, timestamp)
| 1009 | + |
| 1010 | + |
851 | 1011 | # ─── Cache I/O ───────────────────────────────────────────────────────────────── |
852 | 1012 |
|
853 | 1013 |
|
@@ -1009,6 +1169,81 @@ async def main(): |
1009 | 1169 | f"{len(client.release_cache)} release cache entries, " |
1010 | 1170 | f"{client._rate_remaining} requests remaining") |
1011 | 1171 |
|
| 1172 | + # ─── Phase 2: Topics ───────────────────────────────────────────────────── |
| 1173 | + # Use GH_TOKEN_TOPICS (dedicated) plus scavenge leftover budget from |
| 1174 | + # the 3 category tokens. |
| 1175 | + |
| 1176 | + topics_token = CATEGORY_TOKENS.get("topics") or FALLBACK_TOKEN |
| 1177 | + if topics_token: |
| 1178 | + print(f"\n{'#'*70}") |
| 1179 | + print(f"# PHASE 2: TOPICS") |
| 1180 | + print(f"{'#'*70}") |
| 1181 | + |
| 1182 | + # Collect all usable tokens: dedicated topics token + leftover from category tokens |
| 1183 | + topic_tokens = [topics_token] |
| 1184 | + for cat_name in ["trending", "new-releases", "most-popular"]: |
| 1185 | + cat_token = CATEGORY_TOKENS.get(cat_name) |
| 1186 | + if cat_token and cat_token != topics_token: |
| 1187 | + topic_tokens.append(cat_token) |
| 1188 | + |
| 1189 | + # Deduplicate while preserving order |
| 1190 | + seen_tokens: Set[str] = set() |
| 1191 | + unique_tokens = [] |
| 1192 | + for t in topic_tokens: |
| 1193 | + if t not in seen_tokens: |
| 1194 | + seen_tokens.add(t) |
| 1195 | + unique_tokens.append(t) |
| 1196 | + |
| 1197 | + print(f" Using {len(unique_tokens)} token(s) for topics") |
| 1198 | + |
| 1199 | + # Check remaining budget on each token |
| 1200 | + usable_clients = [] |
| 1201 | + for i, tok in enumerate(unique_tokens): |
| 1202 | + async with GitHubClient(tok) as probe: |
| 1203 | + data, _ = await probe.get("https://api.github.com/rate_limit") |
| 1204 | + if data: |
| 1205 | + remaining = data.get("resources", {}).get("core", {}).get("remaining", 0) |
| 1206 | + limit = data.get("resources", {}).get("core", {}).get("limit", 0) |
| 1207 | + label = "dedicated" if i == 0 else f"leftover-{i}" |
| 1208 | + print(f" Token {label}: {remaining}/{limit} remaining") |
| 1209 | + if remaining > RATE_LIMIT_FLOOR + 100: |
| 1210 | + usable_clients.append((tok, remaining)) |
| 1211 | + |
| 1212 | + if usable_clients: |
| 1213 | + # Use the token with the most remaining budget first |
| 1214 | + usable_clients.sort(key=lambda x: x[1], reverse=True) |
| 1215 | + best_token, best_remaining = usable_clients[0] |
| 1216 | + print(f" Selected token with {best_remaining} remaining requests") |
| 1217 | + |
| 1218 | + async with GitHubClient(best_token) as client: |
| 1219 | + client._rate_remaining = best_remaining |
| 1220 | + await process_topics(client, timestamp) |
| 1221 | + total_requests += client._request_count |
| 1222 | + total_cache_entries += len(client.release_cache) |
| 1223 | + print(f" [topics] Used {client._request_count} API requests, " |
| 1224 | + f"{len(client.release_cache)} release cache entries, " |
| 1225 | + f"{client._rate_remaining} requests remaining") |
| 1226 | + |
| 1227 | + # If first token ran low, try remaining tokens |
| 1228 | + if client._rate_remaining <= RATE_LIMIT_FLOOR + 50 and len(usable_clients) > 1: |
| 1229 | + for tok, rem in usable_clients[1:]: |
| 1230 | + print(f"\n Switching to next token ({rem} remaining)...") |
| 1231 | + async with GitHubClient(tok) as fallback_client: |
| 1232 | + fallback_client._rate_remaining = rem |
| 1233 | + # Share release cache for efficiency |
| 1234 | + fallback_client.release_cache = client.release_cache |
| 1235 | + await process_topics(fallback_client, timestamp) |
| 1236 | + total_requests += fallback_client._request_count |
| 1237 | + total_cache_entries += len(fallback_client.release_cache) |
| 1238 | + print(f" [topics-fallback] Used {fallback_client._request_count} API requests, " |
| 1239 | + f"{fallback_client._rate_remaining} remaining") |
| 1240 | + if fallback_client._rate_remaining > RATE_LIMIT_FLOOR + 50: |
| 1241 | + break # Still has budget, done |
| 1242 | + else: |
| 1243 | + print(" ⚠ No tokens with enough budget for topics — skipping") |
| 1244 | + else: |
| 1245 | + print("\n⚠ No token available for topics — set GH_TOKEN_TOPICS or GITHUB_TOKEN") |
| 1246 | + |
1012 | 1247 | elapsed = time.time() - start |
1013 | 1248 | print(f"\n{'='*70}") |
1014 | 1249 | print(f"✓ DONE in {elapsed:.0f}s ({elapsed/60:.1f} min)") |
|
0 commit comments