@@ -897,68 +897,60 @@ async def fetch_most_popular(client: GitHubClient, platform: str, budget: Option
897897}
898898
899899
900- async def fetch_topic (
900+ async def search_topic_candidates (
901901 client : GitHubClient ,
902- platform : str ,
903902 topic_name : str ,
904903 topic_config : Dict ,
905- budget : Optional [int ] = None ,
906- ) -> List [Dict ]:
907- """Fetch repos matching a topic category for a platform."""
904+ ) -> List [RepoCandidate ]:
905+ """Search GitHub for repos matching a topic category (platform-agnostic).
906+
907+ Returns unverified candidates — call verify_installers() per platform.
908+ """
908909 print (f"\n { '=' * 60 } " )
909- print (f"TOPIC: { topic_name .upper ()} — { platform . upper () } " )
910+ print (f"TOPIC SEARCH : { topic_name .upper ()} (all platforms) " )
910911 print (f"{ '=' * 60 } " )
911912
912913 topics = topic_config ["topics" ]
913914 keywords = topic_config ["keywords" ]
914- platform_topics = PLATFORMS [platform ]["topics" ]
915- all_langs = PLATFORMS [platform ]["languages" ]["primary" ] + PLATFORMS [platform ]["languages" ]["secondary" ]
916915 seen : Set [str ] = set ()
917916
918917 one_year = (datetime .utcnow () - timedelta (days = 365 )).strftime ("%Y-%m-%d" )
919918 two_years = (datetime .utcnow () - timedelta (days = 730 )).strftime ("%Y-%m-%d" )
920919
921920 specs = []
922921
923- # Strategy: fewer, broader queries to stay within search rate limits.
924- # Each query uses OR to combine multiple category topics, reducing total
925- # search API calls from ~78 to ~18 per topic×platform.
922+ # Strategy: search by CATEGORY TOPICS ONLY — do NOT cross with platform topics.
923+ # Almost no repos tag themselves with both "terminal" AND "windows".
924+ # Platform filtering happens at the installer-verification step (checks for
925+ # .apk, .exe, .dmg, .deb etc. in release assets), which runs per-platform.
926926
927- # 1) Batch category topics into groups of 4, cross with platform topics
928- # e.g. "(topic:terminal OR topic:developer-tools OR topic:cli OR topic:ide)"
929- # AND "(topic:android OR topic:android-app)"
930- # Using AND between groups (implicit) so results must match BOTH category AND platform.
931- # Note: search_repos() prepends "fork:true" automatically.
927+ # 1) Batch category topics into groups of 4, sorted by stars
932928 topic_batches = [topics [i :i + 4 ] for i in range (0 , min (len (topics ), 12 ), 4 )]
933929 for batch in topic_batches :
934- base = f"stars:>10 archived:false pushed:>={ one_year } "
930+ base = f"stars:>50 archived:false pushed:>={ one_year } "
935931 cat_or = " OR " .join (f"topic:{ t } " for t in batch )
936- plat_or = " OR " .join (f"topic:{ t } " for t in platform_topics [:2 ])
937932 specs .append ({
938- "query" : f"{ base } ({ cat_or } ) ( { plat_or } ) " ,
933+ "query" : f"{ base } ({ cat_or } )" ,
939934 "sort" : "stars" , "pages" : 3 , "weight" : 1.5 ,
940935 })
941936
942- # 2) Keywords in name/description + platform topic (top 3 keywords)
937+ # 2) Keywords in name/description
943938 for kw in keywords [:3 ]:
944- base = f"stars:>20 archived:false pushed:>={ one_year } "
945- specs .append ({
946- "query" : _build_query (base , topics = platform_topics [:2 ], description_kw = kw ),
947- "sort" : "stars" , "pages" : 3 , "weight" : 1.2 ,
948- })
949-
950- # 3) All category topics combined + primary language (catches repos without platform topic)
951- # Single query per language instead of per-batch to reduce total API calls.
952- cat_lang_or = " OR " .join (f"topic:{ t } " for t in topics [:8 ])
953- for lang in all_langs [:2 ]:
954- base = f"stars:>20 archived:false pushed:>={ one_year } "
939+ base = f"stars:>100 archived:false pushed:>={ one_year } "
955940 specs .append ({
956- "query" : f"{ base } ( { cat_lang_or } ) language: { lang } " ,
957- "sort" : "stars" , "pages" : 2 , "weight" : 1.0 ,
941+ "query" : f"{ base } { kw } in:name,description " ,
942+ "sort" : "stars" , "pages" : 2 , "weight" : 1.2 ,
958943 })
959944
960- # 4) Broader: all category topics combined, high stars (platform-agnostic)
945+ # 3) All category topics combined, lower star threshold, recently updated
961946 cat_all_or = " OR " .join (f"topic:{ t } " for t in topics [:8 ])
947+ base = f"stars:>20 archived:false pushed:>={ one_year } "
948+ specs .append ({
949+ "query" : f"{ base } ({ cat_all_or } )" ,
950+ "sort" : "updated" , "pages" : 3 , "weight" : 1.0 ,
951+ })
952+
953+ # 4) Broader: high stars, wider time window (catches established projects)
962954 base = f"stars:>500 archived:false pushed:>={ two_years } "
963955 specs .append ({
964956 "query" : f"{ base } ({ cat_all_or } )" ,
@@ -967,46 +959,110 @@ async def fetch_topic(
967959
968960 print (f" { len (specs )} search specs ({ sum (s .get ('pages' , 3 ) for s in specs )} API calls)" )
969961
962+ # Use "android" as dummy platform for make_candidate scoring — score is
963+ # recalculated in verify_installers anyway, and we just need dedup here.
970964 candidates = await _collect_candidates (
971- client , specs , platform , seen , compute_velocity = False , min_score = 0 ,
965+ client , specs , "android" , seen , compute_velocity = False , min_score = 0 ,
972966 )
973967 print (f" { len (candidates )} candidates from { len (seen )} unique repos" )
968+ candidates .sort (key = lambda c : c .stars , reverse = True )
969+ return candidates
974970
975- # Sort by score (platform relevance) then stars
976- candidates .sort (key = lambda c : (c .score , c .stars ), reverse = True )
977- verified = await verify_installers (client , candidates , platform , need_release_date = True , budget = budget )
978971
async def verify_topic_for_platform(
    client: GitHubClient,
    candidates: List[RepoCandidate],
    platform: str,
    topic_name: str,
    budget: Optional[int] = None,
) -> List[Dict]:
    """Check which topic candidates actually ship installers for *platform*.

    Runs the per-platform installer verification over a shared,
    platform-agnostic candidate list, re-sorts the survivors by
    (score, stars) descending, and returns them as plain summary dicts.

    NOTE(review): reconstructed from a whitespace-garbled diff rendering —
    exact spacing inside the printed messages should be confirmed against
    the original file.
    """
    print(f"\n--- {topic_name}/{platform}: verifying {len(candidates)} candidates ---")
    passing = await verify_installers(
        client, candidates, platform, need_release_date=True, budget=budget,
    )
    # Highest-relevance first: primary key is the platform score,
    # ties broken by star count.
    passing.sort(key=lambda repo: (repo.score, repo.stars), reverse=True)
    print(f"  ✓ {len(passing)} {topic_name} repos verified for {platform}")
    return [repo.to_summary("topic") for repo in passing]
982987
983988
984989async def process_topics (client : GitHubClient , timestamp : str ):
985- """Process all 5 topic categories across all platforms."""
990+ """Process all 5 topic categories across all platforms.
991+
992+ Optimization: search queries are platform-agnostic, so we search ONCE per
993+ topic and then verify installers per-platform. This saves ~75% of search
994+ API calls compared to searching per topic×platform.
995+ """
986996 print (f"\n { '#' * 70 } " )
987997 print (f"# TOPICS (5 categories × { len (PLATFORMS )} platforms)" )
988998 print (f"{ '#' * 70 } " )
989999
1000+ # ─── Rate limit check ─────────────────────────────────────────────────
1001+ data , _ = await client .get ("https://api.github.com/rate_limit" )
1002+ if data :
1003+ core = data .get ("resources" , {}).get ("core" , {})
1004+ search = data .get ("resources" , {}).get ("search" , {})
1005+ print (f" Rate limit — core: { core .get ('remaining' , '?' )} /{ core .get ('limit' , '?' )} , "
1006+ f"search: { search .get ('remaining' , '?' )} /{ search .get ('limit' , '?' )} " )
1007+ remaining = core .get ("remaining" , 0 )
1008+ if remaining < 500 :
1009+ print (f" ⚠ WARNING: Low core rate limit ({ remaining } ) — results may be incomplete" )
1010+ search_remaining = search .get ("remaining" , 0 )
1011+ if search_remaining < 5 :
1012+ print (f" ⚠ WARNING: Search rate limit nearly exhausted ({ search_remaining } )" )
1013+
9901014 num_topics = len (TOPIC_CATEGORIES )
9911015 num_platforms = len (PLATFORMS )
992- total_slots = num_topics * num_platforms
1016+ # Budget slots: 1 search + N platform verifications per topic
1017+ total_verify_slots = num_topics * num_platforms
1018+
1019+ # Track results for final summary
1020+ results_summary : Dict [str , Dict [str , int ]] = {}
9931021
9941022 for topic_idx , (topic_name , topic_config ) in enumerate (TOPIC_CATEGORIES .items ()):
9951023 print (f"\n --- Topic { topic_idx + 1 } /{ num_topics } : { topic_name } ---" )
1024+ results_summary [topic_name ] = {}
9961025
997- for platform_idx , platform in enumerate (PLATFORMS .keys ()):
998- slot = topic_idx * num_platforms + platform_idx
999- slots_remaining = total_slots - slot
1000- budget = max ((client ._rate_remaining - RATE_LIMIT_FLOOR ) // slots_remaining , 50 )
1001-
1026+ # Check which platforms still need data
1027+ platforms_needed = []
1028+ for platform in PLATFORMS .keys ():
10021029 cached = load_cache (f"topics/{ topic_name } " , platform )
10031030 if cached :
1004- continue
1031+ # Count existing cached repos for summary
1032+ existing = _load_existing_count (f"topics/{ topic_name } " , platform )
1033+ results_summary [topic_name ][platform ] = existing
1034+ print (f" { platform } : cached ({ existing } repos)" )
1035+ else :
1036+ platforms_needed .append (platform )
1037+
1038+ if not platforms_needed :
1039+ print (f" All platforms cached — skipping" )
1040+ continue
1041+
1042+ # Search once for this topic (platform-agnostic)
1043+ candidates = await search_topic_candidates (client , topic_name , topic_config )
10051044
1006- repos = await fetch_topic (client , platform , topic_name , topic_config , budget )
1045+ if not candidates :
1046+ print (f" ⚠ 0 candidates found — skipping all platforms" )
1047+ for platform in platforms_needed :
1048+ existing = _load_existing_count (f"topics/{ topic_name } " , platform )
1049+ results_summary [topic_name ][platform ] = existing
1050+ print (f" { platform } : existing cache has { existing } repos" )
1051+ continue
1052+
1053+ # Verify installers per platform using the shared candidate list
1054+ for platform_idx , platform in enumerate (platforms_needed ):
1055+ verify_slot = topic_idx * num_platforms + platform_idx
1056+ slots_remaining = max (total_verify_slots - verify_slot , 1 )
1057+ budget = max ((client ._rate_remaining - RATE_LIMIT_FLOOR ) // slots_remaining , 50 )
1058+
1059+ repos = await verify_topic_for_platform (
1060+ client , candidates , platform , topic_name , budget ,
1061+ )
10071062
10081063 if len (repos ) == 0 :
10091064 existing = _load_existing_count (f"topics/{ topic_name } " , platform )
1065+ results_summary [topic_name ][platform ] = existing
10101066 print (f" ⚠ 0 repos fetched — skipping save (existing cache: { existing } repos)" )
10111067 continue
10121068
@@ -1015,10 +1071,50 @@ async def process_topics(client: GitHubClient, timestamp: str):
10151071 if len (repos ) < min_threshold :
10161072 existing = _load_existing_count (f"topics/{ topic_name } " , platform )
10171073 if existing >= min_threshold :
1074+ results_summary [topic_name ][platform ] = existing
10181075 print (f" ⚠ Only { len (repos )} repos fetched but cache has { existing } — keeping cached data" )
10191076 continue
10201077
10211078 save_data (f"topics/{ topic_name } " , platform , repos , timestamp )
1079+ results_summary [topic_name ][platform ] = len (repos )
1080+
1081+ # ─── Final summary ────────────────────────────────────────────────────
1082+ print (f"\n { '=' * 70 } " )
1083+ print (f"TOPICS SUMMARY" )
1084+ print (f"{ '=' * 70 } " )
1085+
1086+ # Header
1087+ platforms_list = list (PLATFORMS .keys ())
1088+ header = f" { 'topic' :<16} " + "" .join (f"{ p :>10} " for p in platforms_list ) + f"{ 'total' :>10} "
1089+ print (header )
1090+ print (f" { '-' * 16 } " + "" .join (f"{ '-' * 10 } " for _ in platforms_list ) + f"{ '-' * 10 } " )
1091+
1092+ grand_total = 0
1093+ for topic_name in TOPIC_CATEGORIES :
1094+ counts = results_summary .get (topic_name , {})
1095+ row = f" { topic_name :<16} "
1096+ topic_total = 0
1097+ for p in platforms_list :
1098+ c = counts .get (p , 0 )
1099+ topic_total += c
1100+ row += f"{ c :>10} "
1101+ row += f"{ topic_total :>10} "
1102+ grand_total += topic_total
1103+ print (row )
1104+
1105+ print (f" { '-' * 16 } " + "" .join (f"{ '-' * 10 } " for _ in platforms_list ) + f"{ '-' * 10 } " )
1106+ print (f" { 'TOTAL' :<16} " + "" .join (
1107+ f"{ sum (results_summary .get (t , {}).get (p , 0 ) for t in TOPIC_CATEGORIES ):>10} "
1108+ for p in platforms_list
1109+ ) + f"{ grand_total :>10} " )
1110+
1111+ # Rate limit after
1112+ data , _ = await client .get ("https://api.github.com/rate_limit" )
1113+ if data :
1114+ core = data .get ("resources" , {}).get ("core" , {})
1115+ search = data .get ("resources" , {}).get ("search" , {})
1116+ print (f"\n Rate limit remaining — core: { core .get ('remaining' , '?' )} /{ core .get ('limit' , '?' )} , "
1117+ f"search: { search .get ('remaining' , '?' )} /{ search .get ('limit' , '?' )} " )
10221118
10231119
10241120# ─── Cache I/O ─────────────────────────────────────────────────────────────────
0 commit comments