Skip to content

Commit 1836b93

Browse files
smoparthclaude
andcommitted
feat(graph): add suggest-collection command for collection optimization
Add fromager graph suggest-collection that analyzes dependency overlap between onboarding packages and existing collections to recommend the best-fit collection for each package. Helps pipeline maintainers make data-driven decisions when assigning new packages to permanent collections. - Iterative DFS traversal over all edge types for full dependency closure - Ranks collections by fewest new packages, then highest coverage - Rich table (default) and JSON output formats - Error handling for bad graph files and duplicate collection names - Tests covering helpers, CLI, and analysis logic Co-Authored-By: Claude-4.6-opus-high <claude@anthropic.com> Closes: #971 Signed-off-by: Shanmukh Pawan <smoparth@redhat.com>
1 parent b5df8e2 commit 1836b93

File tree

2 files changed

+765
-1
lines changed

2 files changed

+765
-1
lines changed

src/fromager/commands/graph.py

Lines changed: 289 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,12 @@
77
import typing
88

99
import click
10+
import rich
11+
import rich.box
1012
from packaging.requirements import Requirement
11-
from packaging.utils import canonicalize_name
13+
from packaging.utils import NormalizedName, canonicalize_name
1214
from packaging.version import Version
15+
from rich.table import Table
1316

1417
from fromager import clickext, context
1518
from fromager.commands import bootstrap
@@ -784,3 +787,288 @@ def n2s(nodes: typing.Iterable[DependencyNode]) -> str:
784787
topo.done(*nodes_to_build)
785788

786789
print(f"\nBuilding {len(graph)} packages in {rounds} rounds.")
790+
791+
792+
def _traverse_edges(
793+
node: DependencyNode,
794+
) -> typing.Iterable[DependencyNode]:
795+
"""Iterative DFS over all of a node's dependency edges.
796+
797+
Yields every reachable ``DependencyNode`` exactly once, excluding the
798+
starting *node* itself. All edge types are followed (install, build,
799+
toplevel).
800+
801+
Args:
802+
node: The starting node (not yielded).
803+
804+
Yields:
805+
Each reachable DependencyNode, in DFS order.
806+
"""
807+
visited: set[str] = {node.key}
808+
stack: list[DependencyNode] = [node]
809+
while stack:
810+
current = stack.pop()
811+
for edge in current.children:
812+
child_node = edge.destination_node
813+
if child_node.key in visited:
814+
continue
815+
visited.add(child_node.key)
816+
yield child_node
817+
stack.append(child_node)
818+
819+
820+
def get_dependency_closure(node: DependencyNode) -> set[NormalizedName]:
821+
"""Compute the full dependency closure for a node.
822+
823+
Traverses all edge types and returns the set of canonical package names
824+
reachable from *node*, including *node* itself.
825+
826+
Args:
827+
node: The starting node to compute the closure for.
828+
829+
Returns:
830+
Set of canonicalized package names in the transitive closure.
831+
"""
832+
dependency_names: set[NormalizedName] = set()
833+
if node.canonicalized_name != ROOT:
834+
dependency_names.add(node.canonicalized_name)
835+
for dependency in _traverse_edges(node):
836+
if dependency.canonicalized_name != ROOT:
837+
dependency_names.add(dependency.canonicalized_name)
838+
return dependency_names
839+
840+
841+
def get_package_names(graph: DependencyGraph) -> set[NormalizedName]:
842+
"""Extract all unique canonical package names from a graph.
843+
844+
Args:
845+
graph: The dependency graph to extract names from.
846+
847+
Returns:
848+
Set of canonicalized package names, excluding the ROOT node.
849+
"""
850+
return {
851+
node.canonicalized_name for node in graph.get_all_nodes() if node.key != ROOT
852+
}
853+
854+
855+
def extract_collection_name(graph_path: str) -> str:
856+
"""Derive a collection name from a graph file path.
857+
858+
Uses the filename stem, stripping a trailing ``-graph`` suffix when present.
859+
For example, ``notebook-graph.json`` becomes ``notebook``.
860+
861+
Args:
862+
graph_path: Filesystem path to a graph JSON file.
863+
864+
Returns:
865+
A short collection name string.
866+
"""
867+
stem = pathlib.PurePath(graph_path).stem
868+
if stem.endswith("-graph"):
869+
stem = stem[: -len("-graph")]
870+
return stem
871+
872+
873+
class _CollectionScore(typing.NamedTuple):
874+
"""Overlap score between a package's dependency closure and a collection."""
875+
876+
collection: str
877+
new_packages: int
878+
existing_packages: int
879+
coverage_percentage: float
880+
881+
882+
def _analyze_suggestions(
883+
toplevel_nodes: list[DependencyNode],
884+
collection_packages: dict[str, set[NormalizedName]],
885+
) -> list[dict[str, typing.Any]]:
886+
"""Score each onboarding top-level package against every collection.
887+
888+
Args:
889+
toplevel_nodes: Top-level nodes from the onboarding graph.
890+
collection_packages: Mapping of collection name to its package name set.
891+
892+
Returns:
893+
List of result dicts, one per top-level package, sorted by package name.
894+
"""
895+
results: list[dict[str, typing.Any]] = []
896+
897+
for node in sorted(toplevel_nodes, key=lambda n: n.canonicalized_name):
898+
dependency_names = get_dependency_closure(node)
899+
total_dependency_count = len(dependency_names)
900+
901+
scores: list[_CollectionScore] = []
902+
for collection_name, packages in collection_packages.items():
903+
existing_count = len(dependency_names & packages)
904+
new_count = total_dependency_count - existing_count
905+
coverage_percentage = (
906+
(existing_count / total_dependency_count * 100)
907+
if total_dependency_count
908+
else 0.0
909+
)
910+
scores.append(
911+
_CollectionScore(
912+
collection_name, new_count, existing_count, coverage_percentage
913+
)
914+
)
915+
916+
# Rank: fewest new packages, then highest coverage, then name for determinism
917+
scores.sort(
918+
key=lambda score: (
919+
score.new_packages,
920+
-score.coverage_percentage,
921+
score.collection,
922+
)
923+
)
924+
best_score = scores[0] if scores else None
925+
926+
logger.debug(
927+
"%s: %d deps, best fit '%s' (%d new, %.1f%% coverage)",
928+
node.canonicalized_name,
929+
total_dependency_count,
930+
best_score.collection if best_score else "none",
931+
best_score.new_packages if best_score else 0,
932+
best_score.coverage_percentage if best_score else 0.0,
933+
)
934+
935+
results.append(
936+
{
937+
"package": str(node.canonicalized_name),
938+
"version": str(node.version),
939+
"total_dependencies": total_dependency_count,
940+
"best_fit": best_score.collection if best_score else "none",
941+
"new_packages": best_score.new_packages if best_score else 0,
942+
"existing_packages": best_score.existing_packages if best_score else 0,
943+
"coverage_percentage": round(best_score.coverage_percentage, 1)
944+
if best_score
945+
else 0.0,
946+
"all_collections": [
947+
{
948+
"collection": score.collection,
949+
"new_packages": score.new_packages,
950+
"existing_packages": score.existing_packages,
951+
"coverage_percentage": round(score.coverage_percentage, 1),
952+
}
953+
for score in scores
954+
],
955+
}
956+
)
957+
958+
return results
959+
960+
961+
def _print_suggest_collection_table(
962+
results: list[dict[str, typing.Any]],
963+
) -> None:
964+
"""Render suggest-collection results as a Rich table."""
965+
table = Table(
966+
title="Collection Suggestions for Onboarding Packages",
967+
box=rich.box.MARKDOWN,
968+
title_justify="left",
969+
)
970+
table.add_column("Package", justify="left", no_wrap=True)
971+
table.add_column("Version", justify="left", no_wrap=True)
972+
table.add_column("Total Deps", justify="right", no_wrap=True)
973+
table.add_column("Best Fit", justify="left", no_wrap=True)
974+
table.add_column("New Pkgs", justify="right", no_wrap=True)
975+
table.add_column("Existing", justify="right", no_wrap=True)
976+
table.add_column("Coverage", justify="right", no_wrap=True)
977+
978+
for result in results:
979+
table.add_row(
980+
result["package"],
981+
result["version"],
982+
str(result["total_dependencies"]),
983+
result["best_fit"],
984+
str(result["new_packages"]),
985+
str(result["existing_packages"]),
986+
f"{result['coverage_percentage']:.1f}%",
987+
)
988+
989+
rich.get_console().print(table)
990+
991+
992+
@graph.command(name="suggest-collection")
993+
@click.option(
994+
"--format",
995+
"output_format",
996+
type=click.Choice(["table", "json"], case_sensitive=False),
997+
default="table",
998+
help="Output format (default: table)",
999+
)
1000+
@click.argument("onboarding-graph", type=str)
1001+
@click.argument("collection-graphs", nargs=-1, required=True, type=str)
1002+
def suggest_collection(
1003+
output_format: str,
1004+
onboarding_graph: str,
1005+
collection_graphs: tuple[str, ...],
1006+
) -> None:
1007+
"""Suggest the best-fit collection for each onboarding package.
1008+
1009+
Analyzes dependency overlap between top-level packages in ONBOARDING_GRAPH
1010+
and the existing COLLECTION_GRAPHS to recommend where each onboarding
1011+
package should be placed.
1012+
1013+
For each top-level package in the onboarding graph, computes the full
1014+
transitive dependency closure and compares it against every collection.
1015+
Collections are ranked by fewest new packages required, then by highest
1016+
dependency coverage.
1017+
1018+
\b
1019+
ONBOARDING_GRAPH Path to the onboarding collection graph.json.
1020+
COLLECTION_GRAPHS One or more paths to existing collection graph.json files.
1021+
"""
1022+
try:
1023+
onboarding = DependencyGraph.from_file(onboarding_graph)
1024+
except Exception as err:
1025+
raise click.ClickException(
1026+
f"Failed to load onboarding graph {onboarding_graph}: {err}"
1027+
) from err
1028+
1029+
root = onboarding.get_root_node()
1030+
1031+
toplevel_nodes: list[DependencyNode] = [
1032+
edge.destination_node
1033+
for edge in root.children
1034+
if edge.req_type == RequirementType.TOP_LEVEL
1035+
]
1036+
1037+
if not toplevel_nodes:
1038+
if output_format == "json":
1039+
click.echo("[]")
1040+
else:
1041+
click.echo("No top-level packages found in onboarding graph.")
1042+
return
1043+
1044+
logger.info(
1045+
"Loaded onboarding graph with %d top-level packages", len(toplevel_nodes)
1046+
)
1047+
1048+
collection_packages: dict[str, set[NormalizedName]] = {}
1049+
for graph_path in collection_graphs:
1050+
collection_name = extract_collection_name(graph_path)
1051+
if collection_name in collection_packages:
1052+
raise click.ClickException(
1053+
f"Duplicate collection name '{collection_name}' from {graph_path}. "
1054+
"Rename one of the graph files to avoid ambiguity."
1055+
)
1056+
try:
1057+
collection_graph = DependencyGraph.from_file(graph_path)
1058+
except Exception as err:
1059+
raise click.ClickException(
1060+
f"Failed to load collection graph {graph_path}: {err}"
1061+
) from err
1062+
collection_packages[collection_name] = get_package_names(collection_graph)
1063+
logger.debug(
1064+
"Collection '%s': %d packages",
1065+
collection_name,
1066+
len(collection_packages[collection_name]),
1067+
)
1068+
1069+
results = _analyze_suggestions(toplevel_nodes, collection_packages)
1070+
1071+
if output_format == "json":
1072+
click.echo(json.dumps(results, indent=2))
1073+
else:
1074+
_print_suggest_collection_table(results)

0 commit comments

Comments
 (0)