Skip to content

Commit 2ec1705

Browse files
smoparth and claude committed
feat(graph): add suggest-collection command for collection optimization
Add `fromager graph suggest-collection`, which analyzes dependency overlap between onboarding packages and existing collections to recommend the best-fit collection for each package. It helps pipeline maintainers make data-driven decisions when assigning new packages to permanent collections.

- Iterative DFS traversal over all edge types for full dependency closure
- Ranks collections by fewest new packages, then highest coverage
- Rich table (default) and JSON output formats
- Error handling for bad graph files and duplicate collection names
- Tests covering helpers, CLI, and analysis logic

Co-Authored-By: Claude-4.6-opus-high <claude@anthropic.com>
Closes: #971
Signed-off-by: Shanmukh Pawan <smoparth@redhat.com>
1 parent b5df8e2 commit 2ec1705

3 files changed

Lines changed: 725 additions & 1 deletion

File tree

src/fromager/commands/graph.py

Lines changed: 253 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,12 @@
77
import typing
88

99
import click
10+
import rich
11+
import rich.box
1012
from packaging.requirements import Requirement
11-
from packaging.utils import canonicalize_name
13+
from packaging.utils import NormalizedName, canonicalize_name
1214
from packaging.version import Version
15+
from rich.table import Table
1316

1417
from fromager import clickext, context
1518
from fromager.commands import bootstrap
@@ -784,3 +787,252 @@ def n2s(nodes: typing.Iterable[DependencyNode]) -> str:
784787
topo.done(*nodes_to_build)
785788

786789
print(f"\nBuilding {len(graph)} packages in {rounds} rounds.")
790+
791+
792+
def get_dependency_closure(node: DependencyNode) -> set[NormalizedName]:
    """Collect the package names that make up the closure of ``node``.

    The starting node counts as part of its own closure. Everything else
    comes from ``node.iter_all_dependencies()``. Any entry whose canonical
    name equals ``ROOT`` is left out of the result.

    Args:
        node: The node whose closure should be computed.

    Returns:
        Canonicalized names of ``node`` and of every node yielded by
        ``node.iter_all_dependencies()``, with ``ROOT`` excluded. Several
        versions of one package collapse to a single name.
    """
    # Treat the starting node like any other member so one filter covers all.
    members = [node, *node.iter_all_dependencies()]
    return {
        member.canonicalized_name
        for member in members
        if member.canonicalized_name != ROOT
    }
811+
812+
813+
def get_package_names(graph: DependencyGraph) -> set[NormalizedName]:
    """Return the distinct canonical package names stored in ``graph``.

    Args:
        graph: Dependency graph whose nodes are inspected.

    Returns:
        Canonicalized names of all nodes returned by
        ``graph.get_all_nodes()``, skipping the node whose key is ``ROOT``.
    """
    names: set[NormalizedName] = set()
    for graph_node in graph.get_all_nodes():
        # The synthetic root is not a real package.
        if graph_node.key == ROOT:
            continue
        names.add(graph_node.canonicalized_name)
    return names
825+
826+
827+
def extract_collection_name(graph_path: str) -> str:
    """Turn a graph file path into a collection name.

    The name is the final path component with its last suffix removed, so
    ``collections/base.json`` becomes ``base`` and ``b.tar.gz`` becomes
    ``b.tar``. Directories in the path play no role, which means two files
    with the same basename in different directories yield the same name.

    Args:
        graph_path: Filesystem path to a graph JSON file.

    Returns:
        The filename without its (last) extension.
    """
    # PurePath never touches the filesystem; the file need not exist here.
    parsed_path = pathlib.PurePath(graph_path)
    return parsed_path.stem
839+
840+
841+
class _CollectionScore(typing.NamedTuple):
    """How well a single collection covers one package's dependency closure.

    Field order matters: instances are built positionally.
    """

    # Name of the collection this score belongs to.
    collection: str
    # Number of closure packages the collection does not contain yet.
    new_packages: int
    # Number of closure packages the collection already contains.
    existing_packages: int
    # existing_packages as a share of the closure, on a 0-100 scale.
    coverage_percentage: float
848+
849+
850+
def _analyze_suggestions(
    toplevel_nodes: list[DependencyNode],
    collection_packages: dict[str, set[NormalizedName]],
) -> list[dict[str, typing.Any]]:
    """Rank every collection for each onboarding top-level package.

    For each node the dependency closure (see ``get_dependency_closure``) is
    intersected with each collection's package set. Collections are ordered
    by fewest missing packages, then highest coverage, then name, and the
    first one becomes the "best fit". When ``collection_packages`` is empty
    the best fit is reported as ``"none"`` with zeroed counters.

    Args:
        toplevel_nodes: Top-level nodes from the onboarding graph.
        collection_packages: Mapping of collection name to its package names.

    Returns:
        One JSON-serializable dict per top-level node, ordered by package
        name. Each dict carries the best-fit summary plus an
        ``all_collections`` list with the full ranking. Percentages in the
        dicts are rounded to one decimal place.
    """
    report: list[dict[str, typing.Any]] = []
    ordered_nodes = sorted(toplevel_nodes, key=lambda item: item.canonicalized_name)

    for package_node in ordered_nodes:
        closure = get_dependency_closure(package_node)
        closure_size = len(closure)

        ranking: list[_CollectionScore] = []
        for name, members in collection_packages.items():
            shared = len(closure & members)
            # Guard against dividing by zero for an empty closure.
            if closure_size:
                percent = shared / closure_size * 100
            else:
                percent = 0.0
            ranking.append(
                _CollectionScore(name, closure_size - shared, shared, percent)
            )

        # Rank: fewest new packages, then highest coverage, then name so the
        # outcome does not depend on the order collections were supplied in.
        ranking.sort(
            key=lambda entry: (
                entry.new_packages,
                -entry.coverage_percentage,
                entry.collection,
            )
        )

        # Resolve the winner once; fall back to placeholders without one.
        if ranking:
            best_name, best_new, best_existing, best_coverage = ranking[0]
        else:
            best_name, best_new, best_existing, best_coverage = "none", 0, 0, 0.0

        logger.debug(
            "%s: %d deps, best fit '%s' (%d new, %.1f%% coverage)",
            package_node.canonicalized_name,
            closure_size,
            best_name,
            best_new,
            best_coverage,
        )

        breakdown: list[dict[str, typing.Any]] = []
        for entry in ranking:
            breakdown.append(
                {
                    "collection": entry.collection,
                    "new_packages": entry.new_packages,
                    "existing_packages": entry.existing_packages,
                    "coverage_percentage": round(entry.coverage_percentage, 1),
                }
            )

        report.append(
            {
                "package": str(package_node.canonicalized_name),
                "version": str(package_node.version),
                "total_dependencies": closure_size,
                "best_fit": best_name,
                "new_packages": best_new,
                "existing_packages": best_existing,
                "coverage_percentage": round(best_coverage, 1),
                "all_collections": breakdown,
            }
        )

    return report
927+
928+
929+
def _print_suggest_collection_table(
    results: list[dict[str, typing.Any]],
) -> None:
    """Print the suggest-collection results as a Markdown-style Rich table.

    Args:
        results: Result dicts as produced by ``_analyze_suggestions``. Only
            the best-fit summary keys are shown; ``all_collections`` is not
            rendered.
    """
    # Column headers with their alignment, in display order.
    column_layout: tuple[tuple[str, typing.Literal["left", "right"]], ...] = (
        ("Package", "left"),
        ("Version", "left"),
        ("Total Deps", "right"),
        ("Best Fit", "left"),
        ("New Pkgs", "right"),
        ("Existing", "right"),
        ("Coverage", "right"),
    )

    suggestions = Table(
        title="Collection Suggestions for Onboarding Packages",
        box=rich.box.MARKDOWN,
        title_justify="left",
    )
    for header, alignment in column_layout:
        suggestions.add_column(header, justify=alignment, no_wrap=True)

    for entry in results:
        cells = [
            entry["package"],
            entry["version"],
            str(entry["total_dependencies"]),
            entry["best_fit"],
            str(entry["new_packages"]),
            str(entry["existing_packages"]),
            f"{entry['coverage_percentage']:.1f}%",
        ]
        suggestions.add_row(*cells)

    rich.get_console().print(suggestions)
958+
959+
960+
@graph.command(name="suggest-collection")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"], case_sensitive=False),
    default="table",
    help="Output format (default: table)",
)
@click.argument("onboarding-graph", type=str)
@click.argument("collection-graphs", nargs=-1, required=True, type=str)
def suggest_collection(
    output_format: str,
    onboarding_graph: str,
    collection_graphs: tuple[str, ...],
) -> None:
    """Suggest the best-fit collection for each onboarding package.

    Analyzes dependency overlap between top-level packages in ONBOARDING_GRAPH
    and the existing COLLECTION_GRAPHS to recommend where each onboarding
    package should be placed.

    For each top-level package in the onboarding graph, computes the full
    transitive dependency closure and compares it against every collection.
    Collections are ranked by fewest new packages required, then by highest
    dependency coverage.

    \b
    ONBOARDING_GRAPH Path to the onboarding collection graph.json.
    COLLECTION_GRAPHS One or more paths to existing collection graph.json files.
    """
    # NOTE: the docstring above is rendered by Click as the command's help
    # text, so it is user-facing output rather than internal documentation.

    # Any failure while reading the graph is reported as a CLI error.
    try:
        onboarding = DependencyGraph.from_file(onboarding_graph)
    except Exception as exc:
        raise click.ClickException(
            f"Failed to load onboarding graph {onboarding_graph}: {exc}"
        ) from exc

    # Only nodes attached to the root through a TOP_LEVEL edge are analyzed.
    toplevel_nodes: list[DependencyNode] = []
    for root_edge in onboarding.get_root_node().children:
        if root_edge.req_type == RequirementType.TOP_LEVEL:
            toplevel_nodes.append(root_edge.destination_node)

    if not toplevel_nodes:
        # This only warns on stderr; processing continues and the command
        # still emits an (empty) result in the requested format.
        click.echo("No top-level packages found in onboarding graph.", err=True)

    logger.info(
        "Loaded onboarding graph with %d top-level packages", len(toplevel_nodes)
    )

    collection_packages: dict[str, set[NormalizedName]] = {}
    for graph_path in collection_graphs:
        name = extract_collection_name(graph_path)
        # Names come from file basenames, so equal basenames would silently
        # overwrite each other; reject them before loading the file.
        if name in collection_packages:
            raise click.ClickException(
                f"Duplicate collection name '{name}' from {graph_path}. "
                "Rename one of the graph files to avoid ambiguity."
            )
        try:
            loaded_graph = DependencyGraph.from_file(graph_path)
        except Exception as exc:
            raise click.ClickException(
                f"Failed to load collection graph {graph_path}: {exc}"
            ) from exc
        members = get_package_names(loaded_graph)
        collection_packages[name] = members
        logger.debug(
            "Collection '%s': %d packages",
            name,
            len(members),
        )

    results = _analyze_suggestions(toplevel_nodes, collection_packages)

    if output_format != "json":
        _print_suggest_collection_table(results)
        return
    click.echo(json.dumps(results, indent=2))

src/fromager/dependency_graph.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,24 @@ def iter_build_requirements(self) -> typing.Iterable[DependencyNode]:
158158
):
159159
yield install_edge.destination_node
160160

161+
def iter_all_dependencies(self) -> typing.Iterable[DependencyNode]:
    """Yield each node reachable from this one, once, via any child edge.

    Performs an iterative depth-first walk over ``children`` without
    filtering on the edge's requirement type. Nodes are de-duplicated by
    their ``key``. This node's own key is marked as seen up front, so it is
    never yielded, even when a cycle leads back to it.
    """
    seen_keys: set[str] = {self.key}
    pending: list[DependencyNode] = [self]
    while pending:
        parent = pending.pop()
        for link in parent.children:
            target = link.destination_node
            if target.key not in seen_keys:
                # Mark on discovery so a node is queued and yielded only once.
                seen_keys.add(target.key)
                yield target
                pending.append(target)
178+
161179
def iter_install_requirements(self) -> typing.Iterable[DependencyNode]:
162180
"""Get all unique, recursive install requirements"""
163181
visited: set[str] = set()

0 commit comments

Comments
 (0)