diff --git a/cyclonedx_py/_internal/utils/cdx.py b/cyclonedx_py/_internal/utils/cdx.py
index 3e331015c..1f9a24f7c 100644
--- a/cyclonedx_py/_internal/utils/cdx.py
+++ b/cyclonedx_py/_internal/utils/cdx.py
@@ -35,6 +35,7 @@
 from cyclonedx.model.license import DisjunctiveLicense, License, LicenseAcknowledgement, LicenseExpression
 
 from ... import __version__ as _THIS_VERSION  # noqa:N812
+from .url_classifiers import _MAP_KNOWN_URL_LABELS, _MAP_URL_LABEL_PREFIXES
 
 
 def make_bom(**kwargs: Any) -> Bom:
@@ -119,32 +120,24 @@ def licenses_fixup(component: 'Component') -> None:
         component.evidence.licenses.update(licenses)
 
 
-_MAP_KNOWN_URL_LABELS: dict[str, ExternalReferenceType] = {
-    # see https://peps.python.org/pep-0345/#project-url-multiple-use
-    # see https://github.com/pypi/warehouse/issues/5947#issuecomment-699660629
-    'bugtracker': ExternalReferenceType.ISSUE_TRACKER,
-    'issuetracker': ExternalReferenceType.ISSUE_TRACKER,
-    'issues': ExternalReferenceType.ISSUE_TRACKER,
-    'bugreports': ExternalReferenceType.ISSUE_TRACKER,
-    'tracker': ExternalReferenceType.ISSUE_TRACKER,
-    'home': ExternalReferenceType.WEBSITE,
-    'homepage': ExternalReferenceType.WEBSITE,
-    'download': ExternalReferenceType.DISTRIBUTION,
-    'documentation': ExternalReferenceType.DOCUMENTATION,
-    'docs': ExternalReferenceType.DOCUMENTATION,
-    'changelog': ExternalReferenceType.RELEASE_NOTES,
-    'changes': ExternalReferenceType.RELEASE_NOTES,
-    # 'source': ExternalReferenceType.SOURCE-DISTRIBUTION,
-    'repository': ExternalReferenceType.VCS,
-    'github': ExternalReferenceType.VCS,
-    'chat': ExternalReferenceType.CHAT,
-}
-
 _NOCHAR_MATCHER = re_compile('[^a-z]')
 
 
-def url_label_to_ert(value: str) -> ExternalReferenceType:
-    return _MAP_KNOWN_URL_LABELS.get(
-        _NOCHAR_MATCHER.sub('', str(value).lower()),
-        ExternalReferenceType.OTHER
-    )
+def url_label_to_ert(label: str, url: Optional[str] = None) -> ExternalReferenceType:
+    """Map a project-URL label to an ``ExternalReferenceType``.
+
+    ``label`` is lowercased and stripped of every non-``[a-z]`` character, then
+    looked up exactly in ``_MAP_KNOWN_URL_LABELS``; on a miss the first matching
+    prefix in ``_MAP_URL_LABEL_PREFIXES`` wins, else ``OTHER`` is returned.
+    ``url`` is accepted but is not consulted: classification is label-only.
+    """
+    norm = _NOCHAR_MATCHER.sub('', str(label).lower())
+    # 1. exact label
+    ert = _MAP_KNOWN_URL_LABELS.get(norm)
+    if ert is not None:
+        return ert
+    # 2. label prefix
+    for prefix, pert in _MAP_URL_LABEL_PREFIXES:
+        if norm.startswith(prefix):
+            return pert
+    return ExternalReferenceType.OTHER
diff --git a/cyclonedx_py/_internal/utils/url_classifiers.py b/cyclonedx_py/_internal/utils/url_classifiers.py
new file mode 100644
index 000000000..cc487b153
--- /dev/null
+++ b/cyclonedx_py/_internal/utils/url_classifiers.py
@@ -0,0 +1,81 @@
+# This file is part of CycloneDX Python
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) OWASP Foundation. All Rights Reserved.
+
+
+"""
+Pure mapping data for URL -> ExternalReferenceType classification.
+
+This module is DATA ONLY -- no logic. To extend classification, add rows here.
+Two match styles are implemented, applied by ``cdx.url_label_to_ert`` in this precedence order:
+
+1. _MAP_KNOWN_URL_LABELS         exact label (normalized: lowercased, non-[a-z] stripped)
+2. _MAP_URL_LABEL_PREFIXES       label prefix (PyPI '*' semantics); first match wins
+
+Two host-based styles are specified here but NOT implemented yet: neither map is
+defined in this module, and ``cdx.url_label_to_ert`` ignores its ``url`` argument.
+
+3. _MAP_KNOWN_URL_HOST_SUFFIXES  host == key OR host endswith '.'+key (domain + subdomains)
+4. _MAP_KNOWN_URL_HOST_PREFIXES  host == key OR host startswith key+'.' (e.g. docs.*)
+
+Label keys MUST already be normalized (lowercase, only [a-z]).
+Host keys MUST be lowercase.
+
+see https://docs.pypi.org/project_metadata/#icons
+"""
+
+from cyclonedx.model import ExternalReferenceType
+
+# 1. exact label -> ERT
+_MAP_KNOWN_URL_LABELS: dict[str, ExternalReferenceType] = {
+    # see https://peps.python.org/pep-0345/#project-url-multiple-use
+    # see https://github.com/pypi/warehouse/issues/5947#issuecomment-699660629
+    'bugtracker': ExternalReferenceType.ISSUE_TRACKER,
+    'issuetracker': ExternalReferenceType.ISSUE_TRACKER,
+    'issues': ExternalReferenceType.ISSUE_TRACKER,
+    'bugreports': ExternalReferenceType.ISSUE_TRACKER,
+    'tracker': ExternalReferenceType.ISSUE_TRACKER,
+    'home': ExternalReferenceType.WEBSITE,
+    'homepage': ExternalReferenceType.WEBSITE,
+    'download': ExternalReferenceType.DISTRIBUTION,
+    'documentation': ExternalReferenceType.DOCUMENTATION,
+    'docs': ExternalReferenceType.DOCUMENTATION,
+    'changelog': ExternalReferenceType.RELEASE_NOTES,
+    'changes': ExternalReferenceType.RELEASE_NOTES,
+    'releasenotes': ExternalReferenceType.RELEASE_NOTES,
+    'news': ExternalReferenceType.RELEASE_NOTES,
+    'whatsnew': ExternalReferenceType.RELEASE_NOTES,
+    'history': ExternalReferenceType.RELEASE_NOTES,
+    'repository': ExternalReferenceType.VCS,
+    'source': ExternalReferenceType.VCS,
+    'sourcecode': ExternalReferenceType.VCS,
+    'github': ExternalReferenceType.VCS,
+    'chat': ExternalReferenceType.CHAT,
+}
+
+# 2. label prefix -> ERT (ordered; first match wins). normalized prefixes.
+_MAP_URL_LABEL_PREFIXES: tuple[tuple[str, ExternalReferenceType], ...] = (
+    ('documentation', ExternalReferenceType.DOCUMENTATION),
+    ('docs', ExternalReferenceType.DOCUMENTATION),
+    ('bug', ExternalReferenceType.ISSUE_TRACKER),
+    ('issue', ExternalReferenceType.ISSUE_TRACKER),
+    ('tracker', ExternalReferenceType.ISSUE_TRACKER),
+    ('report', ExternalReferenceType.ISSUE_TRACKER),
+    ('funding', ExternalReferenceType.OTHER),
+    ('sponsor', ExternalReferenceType.OTHER),
+    ('donation', ExternalReferenceType.OTHER),
+    ('donate', ExternalReferenceType.OTHER),
+)
diff --git a/tests/unit/test_utils_cdx.py b/tests/unit/test_utils_cdx.py
index 8c7471bdd..ee9475105 100644
--- a/tests/unit/test_utils_cdx.py
+++ b/tests/unit/test_utils_cdx.py
@@ -23,8 +23,9 @@
 from cyclonedx.model import ExternalReference, ExternalReferenceType
 from cyclonedx.model.component import Component, ComponentType
 from cyclonedx.model.license import License, LicenseAcknowledgement
+from ddt import data, ddt, unpack
 
-from cyclonedx_py._internal.utils.cdx import make_bom
+from cyclonedx_py._internal.utils.cdx import make_bom, url_label_to_ert
 from tests import EXPECTED_TOOL_NAME, load_pyproject
 
 
@@ -79,3 +80,42 @@ def test_extrefs(self) -> None:
         c = self.__get_c_by_name(EXPECTED_TOOL_NAME)
         ers: tuple[ExternalReference, ...] = tuple(c.external_references)
         self.assertExtRefs(p, ers)
+
+
+@ddt
+class TestUrlLabelToErt(TestCase):
+
+    @data(
+        # exact labels
+        ('Homepage', ExternalReferenceType.WEBSITE),
+        ('Home', ExternalReferenceType.WEBSITE),
+        ('Download', ExternalReferenceType.DISTRIBUTION),
+        ('Changelog', ExternalReferenceType.RELEASE_NOTES),
+        ('Change log', ExternalReferenceType.RELEASE_NOTES),
+        ('Release notes', ExternalReferenceType.RELEASE_NOTES),
+        ("What's new", ExternalReferenceType.RELEASE_NOTES),
+        ('History', ExternalReferenceType.RELEASE_NOTES),
+        ('Repository', ExternalReferenceType.VCS),
+        ('Source', ExternalReferenceType.VCS),
+        ('Source Code', ExternalReferenceType.VCS),
+        ('Chat', ExternalReferenceType.CHAT),
+        # PyPI '*' label families (matched exactly or by prefix)
+        ('Documentation', ExternalReferenceType.DOCUMENTATION),
+        ('Documentation for users', ExternalReferenceType.DOCUMENTATION),
+        ('Docs (latest)', ExternalReferenceType.DOCUMENTATION),
+        ('Bug Reports', ExternalReferenceType.ISSUE_TRACKER),
+        ('Issue Tracker', ExternalReferenceType.ISSUE_TRACKER),
+        ('Tracker', ExternalReferenceType.ISSUE_TRACKER),
+        ('Report a bug', ExternalReferenceType.ISSUE_TRACKER),
+        ('Funding', ExternalReferenceType.OTHER),
+        ('Sponsor this project', ExternalReferenceType.OTHER),
+        ('Donate', ExternalReferenceType.OTHER),
+        # unknown -> OTHER
+        ('Some Random Label', ExternalReferenceType.OTHER),
+    )
+    @unpack
+    def test_label_only(self, label: str, expected: ExternalReferenceType) -> None:
+        self.assertIs(expected, url_label_to_ert(label))
+
+    def test_label_only_url_none_backcompat(self) -> None:
+        self.assertIs(ExternalReferenceType.WEBSITE, url_label_to_ert('Homepage', None))