From df706bff012e5bfa06ee1ad1288cc7fef900536d Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 8 Sep 2023 13:35:19 -0400 Subject: [PATCH 01/24] Fix to get_schema(s) to use URL prefixed with slash for vapp. --- dcicutils/ff_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index 280bdc0df..cad9f4028 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -990,7 +990,7 @@ def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optiona base_url = f"profiles/{to_camel_case(name)}.json" add_on = 'frame=raw' if portal_vapp: - full_url = f"{base_url}?{add_on}" + full_url = f"/{base_url}?{add_on}" res = portal_vapp.get(full_url) return get_response_json(res) else: @@ -1022,7 +1022,7 @@ def get_schemas(key=None, ff_env: Optional[str] = None, *, allow_abstract: bool base_url = 'profiles/' add_on = 'frame=raw' if portal_vapp: - full_url = f"{base_url}?{add_on}" + full_url = f"/{base_url}?{add_on}" schemas: Dict[str, Dict] = portal_vapp.get(full_url) else: schemas: Dict[str, Dict] = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on) @@ -1488,6 +1488,7 @@ def get_response_json(res): it is not present. Used with the metadata functions. """ try: + # TODO: Fix for res being from vapp (webtest.response.TestRespons) call, using MockResponse ... res_json = res.json() except Exception: raise Exception('Cannot get json for request to %s. 
Status' From aaba4986445915f636dd256ba7697c26de840bc5 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 8 Sep 2023 13:47:56 -0400 Subject: [PATCH 02/24] Updated version and CHANGELOG --- CHANGELOG.rst | 9 +++++++++ dcicutils/ff_utils.py | 2 +- pyproject.toml | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a92be15a1..345c0a1d1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,15 @@ Change Log ---------- +7.11.0 +====== + +* In ``ff_utils``: + + * Fix in ``get_schema`` and ``get_schemas`` for the ``portal_vapp`` case needing a leading slash on the URL. + * Fix in ``get_schema`` and ``get_schemas`` for the ``portal_vapp`` returning webtest.response.TestResponse + which has a ``json`` object property rather than a function. + 7.10.0 ====== diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index cad9f4028..d6ca184c6 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -1488,7 +1488,7 @@ def get_response_json(res): it is not present. Used with the metadata functions. """ try: - # TODO: Fix for res being from vapp (webtest.response.TestRespons) call, using MockResponse ... + # TODO: Fix for res being from vapp (webtest.response.TestResponse) call, using MockResponse ... res_json = res.json() except Exception: raise Exception('Cannot get json for request to %s. Status' diff --git a/pyproject.toml b/pyproject.toml index 1078a57ea..65dba0353 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.10.0" +version = "7.11.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From d89319263aa8fe6e01f223676fcb88513cd609cc Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 8 Sep 2023 15:31:58 -0400 Subject: [PATCH 03/24] Fix ff_utils.get_response_json to handle webtest.response.TestResponse. 
--- dcicutils/ff_utils.py | 8 +++++--- dcicutils/misc_utils.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index d6ca184c6..6f011bea4 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -17,7 +17,7 @@ # S3BucketName, S3KeyName, ) from .lang_utils import disjoined_list -from .misc_utils import PRINT, to_camel_case, remove_suffix, VirtualApp +from .misc_utils import PRINT, to_camel_case, remove_suffix, VirtualApp, VirtualAppResponse # TODO (C4-92, C4-102): Probably to centralize this information in env_utils. Also figure out relation to CGAP. @@ -1488,8 +1488,10 @@ def get_response_json(res): it is not present. Used with the metadata functions. """ try: - # TODO: Fix for res being from vapp (webtest.response.TestResponse) call, using MockResponse ... - res_json = res.json() + if isinstance(res, VirtualAppResponse): + res_json = res.json + else: + res_json = res.json() except Exception: raise Exception('Cannot get json for request to %s. Status' ' code: %s. Response text: %s' % diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 88c228c7f..748d79a6d 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -295,6 +295,7 @@ def app(self): """ return self.wrapped_app.app +VirtualAppResponse = webtest.response.TestResponse def exported(*variables): """ From 10b6aac820cf998a2c43a4f6521fec888ff762bd Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 8 Sep 2023 15:39:30 -0400 Subject: [PATCH 04/24] flake8 fix --- dcicutils/misc_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 748d79a6d..cc18f4b19 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -295,8 +295,10 @@ def app(self): """ return self.wrapped_app.app + VirtualAppResponse = webtest.response.TestResponse + def exported(*variables): """ This function does nothing but is used for declaration purposes. 
From f90217350ca5a52270a59eaecf92f6419249b264 Mon Sep 17 00:00:00 2001 From: David Michaels Date: Fri, 8 Sep 2023 15:54:48 -0400 Subject: [PATCH 05/24] test fix --- test/test_ff_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index 16413a519..b221ae693 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -1366,7 +1366,7 @@ def test_get_schema_with_vapp(): mock_get_authentication_with_server.assert_not_called() mock_get_metadata.assert_not_called() - sample_vapp.get.assert_called_once_with('profiles/User.json?frame=raw') + sample_vapp.get.assert_called_once_with('/profiles/User.json?frame=raw') @pytest.mark.unit @@ -1418,7 +1418,7 @@ def test_get_schemas_with_vapp(): mock_get_authentication_with_server.assert_not_called() mock_get_metadata.assert_not_called() - sample_vapp.get.assert_called_once_with('profiles/?frame=raw') + sample_vapp.get.assert_called_once_with('/profiles/?frame=raw') def test_get_schemas_options(): From 8931a89de1ff81bebdddb3f9c2b262812945650b Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 13 Sep 2023 19:35:16 -0400 Subject: [PATCH 06/24] first pass at a run-license-checker script. --- CHANGELOG.rst | 32 ++ dcicutils/license_utils.py | 488 +++++++++++++++++------ dcicutils/scripts/run_license_checker.py | 63 +++ pyproject.toml | 4 +- test/test_license_utils.py | 22 +- 5 files changed, 471 insertions(+), 138 deletions(-) create mode 100644 dcicutils/scripts/run_license_checker.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 345c0a1d1..99d4da9ce 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,38 @@ Change Log ---------- +7.12.0 +====== + +* In ``license_utils``: + + * Add an ``RLanguageFramework``. + + * Add various additional checker classes, and a registry to catalog them. Refactor so that pre-existing + classes better share information in an inherited way. 
+ + +------------------------------------------+--------------------------------+----------------+ + | Class | Checker Name | Status | + +==========================================+================================+================+ + | ``ParkLabCommonLicenseChecker`` | ``park-lab-common`` | New | + +------------------------------------------+--------------------------------+----------------+ + | ``ParkLabGplPipelineLicenseChecker`` | ``park-lab-gpl-pipeline`` | New | + +------------------------------------------+--------------------------------+----------------+ + | ``ParkLabCommonServerLicenseChecker`` | ``park-lab-common-server`` | New | + +------------------------------------------+--------------------------------+----------------+ + | ``C4InfrastructureLicenseChecker`` | ``c4-infrastructure`` | Refactored | + +------------------------------------------+--------------------------------+----------------+ + | ``C4PythonInfrastructureLicenseChecker`` | ``c4-python-infrastructure`` | Refactored | + +------------------------------------------+--------------------------------+----------------+ + | ``Scan2PipelineLicenseChecker`` | ``scan2-pipeline`` | New | + +------------------------------------------+--------------------------------+----------------+ + +* In ``scripts``: + + * Add a ``run-license-checker`` script, implemented by ``run_license_checker.py``, + that runs the license checker whose "checker name" is given as an argument. 
+ + 7.11.0 ====== diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index db18fd7df..8f6b9cefb 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -1,4 +1,5 @@ import contextlib +import csv import datetime import io import json @@ -23,13 +24,15 @@ # import piplicenses from collections import defaultdict -from typing import Any, Dict, DefaultDict, List, Optional, Type, Union +from typing import Any, Dict, DefaultDict, List, Optional, Type, TypeVar, Union # For obscure reasons related to how this file is used for early prototyping, these must use absolute references # to modules, not relative references. Later when things are better installed, we can make refs relative again. +from dcicutils.exceptions import InvalidParameterError from dcicutils.lang_utils import there_are -from dcicutils.misc_utils import PRINT, get_error_message, local_attrs +from dcicutils.misc_utils import PRINT, get_error_message, local_attrs, ignored +T = TypeVar("T") # logging.basicConfig() # logger = logging.getLogger(__name__) @@ -43,6 +46,10 @@ _STATUS = 'status' +def augment(d: dict, by: dict): + return dict(d, **by) + + class LicenseStatus: ALLOWED = "ALLOWED" SPECIALLY_ALLOWED = "SPECIALLY_ALLOWED" @@ -87,13 +94,13 @@ def temporary_registration_for_testing(cls): yield @classmethod - def register(cls, *, name): + def register_framework(cls, *, name): """ Declares a python license framework classs. Mostly these names will be language names like 'python' or 'javascript', but they might be names of other, non-linguistic frameworks (like 'cgap-pipeline', for example). 
""" - def _decorator(framework_class): + def _decorator(framework_class: T) -> T: if not issubclass(framework_class, LicenseFramework): raise ValueError(f"The class {framework_class.__name__} does not inherit from LicenseFramework.") framework_class.NAME = name @@ -117,11 +124,12 @@ def all_frameworks(cls): return sorted(cls.LICENSE_FRAMEWORKS.values(), key=lambda x: x.NAME) -@LicenseFrameworkRegistry.register(name='javascript') +@LicenseFrameworkRegistry.register_framework(name='javascript') class JavascriptLicenseFramework(LicenseFramework): @classmethod - def implicated_licenses(cls, *, licenses_spec: str): + def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: + ignored(package_name) # We only care which licenses were mentioned, not what algebra is used on them. # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". @@ -150,7 +158,7 @@ def get_dependencies(cls): for name, record in records.items(): licenses_spec = record.get(_LICENSES) if '(' in licenses_spec: - licenses = cls.implicated_licenses(licenses_spec=licenses_spec) + licenses = cls.implicated_licenses(package_name=name, licenses_spec=licenses_spec) PRINT(f"Rewriting {licenses_spec!r} as {licenses!r}") elif licenses_spec: licenses = [licenses_spec] @@ -158,13 +166,14 @@ def get_dependencies(cls): licenses = [] entry = { _NAME: name.lstrip('@').split('@')[0], # e.g., @foo/bar@3.7 - _LICENSES: licenses # TODO: could parse this better. + _LICENSES: licenses, # TODO: could parse this better. 
+ _FRAMEWORK: 'javascript' } result.append(entry) return result -@LicenseFrameworkRegistry.register(name='python') +@LicenseFrameworkRegistry.register_framework(name='python') class PythonLicenseFramework(LicenseFramework): @classmethod @@ -184,12 +193,100 @@ def get_dependencies(cls): entry = { _NAME: license_name, _LICENSES: licenses, - _LANGUAGE: 'python', + _FRAMEWORK: 'python', } result.append(entry) return sorted(result, key=lambda x: x.get(_NAME).lower()) +@LicenseFrameworkRegistry.register_framework(name='r') +class RLicenseFramework(LicenseFramework): + + VERBOSE = False + + R_PART_SPEC = re.compile("^Part of R [0-9.]+$") + # This is intended to match ' (= 3)', ' (>= 3)', ' (version 3)', ' (version 3 or greater)' + # It will incidentally and harmlessly also take ' (>version 3)' or '(>= 3 or greater)'. + # It will also correctly handle the unlikely case of ' (= 3 or greater)' + # or will + VERSION_SPEC = re.compile('( [(]([>]?)(?:[=]|version) ([0-9.]+)((?: or (?:greater|later))?)[)])') + GPL_VERSION_CHOICE = re.compile('^GPL-v?([0-9.+]) (?:OR|[|]) GPL-v?([0-9.+])$') + R_LANGUAGE_LICENSE_NAME = 'R-language-license' + + @classmethod + def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: + if cls.R_PART_SPEC.match(licenses_spec): + return [cls.R_LANGUAGE_LICENSE_NAME] + m = cls.GPL_VERSION_CHOICE.match(licenses_spec) + if m: + version_a, version_b = m.groups() + return [f"GPL-{version_a}-or-{version_b}"] + # We only care which licenses were mentioned, not what algebra is used on them. + # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) + # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". + # If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will + # have trouble passing unless both MIT and GPL-3.0 are allowed. 
+ n = 0 + original_licenses_spec = licenses_spec + while n < 1000: # just in case of an infinite loop + n += 1 + m = cls.VERSION_SPEC.search(licenses_spec) + if not m: + break + matched, greater, version_spec, greater2 = m.groups() + greater = greater or greater2 + licenses_spec = licenses_spec.replace(matched, + f"-{version_spec}" + f"{'+' if greater else ''}") + if licenses_spec != original_licenses_spec: + PRINT(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") + licenses = sorted(map(lambda x: x.strip(), + (licenses_spec + .replace('|', ',') + .replace('file ', f'Custom: {package_name} file ') + ).split(','))) + return licenses + + @classmethod + def get_dependencies(cls): + + _PACKAGE = "Package" + _LICENSE = "License" + + found_problems = 0 + + output_bytes = subprocess.check_output(['r', '--no-echo', '-q', '-e', + f'write.csv(installed.packages()[,c("Package", "License")])'], + # This will output to stderr if there's an error, + # but it will still put {} on stdout, which is good enough for us. 
+ stderr=subprocess.DEVNULL) + output = output_bytes.decode('utf-8') + result = [] + first_line = True + for entry in csv.reader(io.StringIO(output)): # [ignore, package, license] + if first_line: + first_line = False + if entry == ["", _PACKAGE, _LICENSE]: # we expect headers + continue + try: + package_name = entry[1] + licenses_spec = entry[2] + licenses = cls.implicated_licenses(package_name=package_name, licenses_spec=licenses_spec) + entry = { + _NAME: package_name, + _LICENSES: licenses, + _FRAMEWORK: 'r', + } + result.append(entry) + except Exception as e: + found_problems += 1 + if cls.VERBOSE: + PRINT(get_error_message(e)) + if found_problems > 0: + warnings.warn(there_are(found_problems, kind="problem", show=False, punctuate=True, tense='past')) + return sorted(result, key=lambda x: x.get(_NAME).lower()) + + class LicenseFileParser: VERBOSE = False @@ -316,7 +413,7 @@ class LicenseChecker: Note that if you don't like these license names, which are admittedly non-standard and do nt seem to use SPDX naming conventions, you can customize the get_dependencies method to return a different list, one of the form - [{"name": "libname", "license_classifier": ["license1", "license2", ...], "language": "python"}] + [{"name": "libname", "license_classifier": ["license1", "license2", ...], "framework": "python"}] by whatever means you like and using whatever names you like. 
""" @@ -499,6 +596,30 @@ class MyOrgLicenseChecker(LicenseChecker): raise LicenseAcceptabilityCheckFailure(unacceptable_licenses=analysis.unacceptable) +class LicenseCheckerRegistry: + + REGISTRY: Dict[str, Type[LicenseChecker]] = {} + + @classmethod + def register_checker(cls, name: str): + def _register(license_checker_class: Type[LicenseChecker]): + cls.REGISTRY[name] = license_checker_class + return license_checker_class + return _register + + @classmethod + def lookup_checker(cls, name: str) -> Type[LicenseChecker]: + result: Optional[Type[LicenseChecker]] = cls.REGISTRY.get(name) + if result is None: + raise InvalidParameterError(parameter='checker_name', value=name, + options=cls.all_checker_names()) + return result + + @classmethod + def all_checker_names(cls): + return list(cls.REGISTRY.keys()) + + class LicenseCheckFailure(Exception): DEFAULT_MESSAGE = "License check failure." @@ -523,16 +644,13 @@ def __init__(self, message=None, unacceptable_licenses=None): super().__init__(message=message) -class C4InfrastructureLicenseChecker(LicenseChecker): +@LicenseCheckerRegistry.register_checker('park-lab-common') +class ParkLabCommonLicenseChecker(LicenseChecker): """ - This set of values is useful to us in Park Lab where these tools were developed. - If you're at some other organization, we recommend you make a class that has values - suitable to your own organizational needs. + Minimal checker common to all tech from Park Lab. """ COPYRIGHT_OWNER = "President and Fellows of Harvard College" - LICENSE_TITLE = "(The )?MIT License" - LICENSE_FRAMEWORKS = ['python', 'javascript'] ALLOWED = [ @@ -583,6 +701,13 @@ class C4InfrastructureLicenseChecker(LicenseChecker): 'Eclipse Public License', 'EPL-2.0', + # The FSF Unlimited License (FSFUL) seems to be a completely permissive license. 
+ # Refs: + # * https://spdx.org/licenses/FSFUL.html + # * https://fedoraproject.org/wiki/Licensing/FSF_Unlimited_License + 'FSF Unlimited License', + 'FSFUL', + # Linking = Yes, Cat = Permissive Software Licenses # Ref: https://en.wikipedia.org/wiki/Historical_Permission_Notice_and_Disclaimer 'Historical Permission Notice and Disclaimer (HPND)', @@ -626,6 +751,16 @@ class C4InfrastructureLicenseChecker(LicenseChecker): 'The Unlicense (Unlicense)', 'Unlicense', + # Various licenses seem to call themselves or be summed up as unlimited. + # So far we know of none that are not highly permissive. + # * boot and KernSmooth are reported by R as being 'Unlimited' + # Refs: + # * https://cran.r-project.org/web/packages/KernSmooth/index.html + # (https://github.com/cran/KernSmooth/blob/master/LICENCE.note) + # * https://cran.r-project.org/package=boot + # (https://github.com/cran/boot/blob/master/DESCRIPTION) + 'Unlimited', + # Linking = Permissive, Private Use = ? # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses 'W3C License', @@ -646,6 +781,29 @@ class C4InfrastructureLicenseChecker(LicenseChecker): 'Zope Public License', ] + EXCEPTIONS = { + + # DFSG = Debian Free Software Guidelines + # Ref: https://en.wikipedia.org/wiki/Debian_Free_Software_Guidelines + # Used as an apparent modifier to other licenses, to say they are approved per Debian. + # For example in this case, pytest-timeout has license: DFSG approved, MIT License, + # but is really just an MIT License that someone has checked is DFSG approved. 
+ 'DFSG approved': [ + 'pytest-timeout', # MIT Licensed + ], + + # Linking = With Restrictions, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'GNU Lesser General Public License v2 or later (LGPLv2+)': [ + 'chardet' # used at runtime during server operation (ingestion), but not modified or distributed + ], + + 'GNU General Public License (GPL)': [ + 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs + ], + + } + EXPECTED_MISSING_LICENSES = [ # This is a name we use for our C4 portals. And it isn't published. @@ -726,7 +884,7 @@ class C4InfrastructureLicenseChecker(LicenseChecker): 'responses', # This seems to get flagged sometimes, but is not the pypi snovault library, it's what our dcicsnovault - # calls itself internally.. In any case, it's under MIT license and OK. + # calls itself internally. In any case, it's under MIT license and OK. # Ref: https://github.com/4dn-dcic/snovault/blob/master/LICENSE.txt 'snovault', @@ -757,141 +915,215 @@ class C4InfrastructureLicenseChecker(LicenseChecker): ] - EXCEPTIONS = { - - 'BSD*': [ - # Although modified to insert the author name into the license text itself, - # the license for these libraries are essentially BSD-3-Clause. - 'formatio', - 'samsam', - # There are some slightly different versions of what appear to be BSD licenses here, - # but clearly the license is permissive. - # Ref: https://www.npmjs.com/package/mutation-observer?activeTab=readme - 'mutation-observer', - ], - - 'Custom: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global': [ - # The use of this URL appears to be a syntax error in the definition of entries-ponyfill - # In fact this seems to be covered by a CC0-1.0 license. 
- # Ref: https://unpkg.com/browse/object.entries-ponyfill@1.0.1/LICENSE - 'object.entries-ponyfill', - ], - - 'Custom: https://github.com/saikocat/colorbrewer.': [ - # The use of this URL appears to be a syntax error in the definition of cartocolor - # In fact, this seems to be covered by a CC-BY-3.0 license. - # Ref: https://www.npmjs.com/package/cartocolor?activeTab=readme - 'cartocolor', - ], - - 'Custom: https://travis-ci.org/component/emitter.png': [ - # The use of this png appears to be a syntax error in the definition of emitter-component. - # In fact, emitter-component uses an MIT License - # Ref: https://www.npmjs.com/package/emitter-component - # Ref: https://github.com/component/emitter/blob/master/LICENSE - 'emitter-component', - ], +@LicenseCheckerRegistry.register_checker('park-lab-gpl-pipeline') +class ParkLabGplPipelineLicenseChecker(ParkLabCommonLicenseChecker): + """ + Minimal checker common to GPL pipelines from Park Lab. + """ - # The 'turfs-jsts' repository (https://github.com/DenisCarriere/turf-jsts/blob/master/README.md) - # seems to lack a license, but appears to be forked from the jsts library that uses - # the Eclipse Public License 1.0 and Eclipse Distribution License 1.0, so probably a permissive - # license is intended. - 'Custom: https://travis-ci.org/DenisCarriere/turf-jsts.svg': [ - 'turf-jsts' - ], + LICENSE_FRAMEWORKS = ['python', 'r'] # TODO: Implement 'conda' and add it here. - # DFSG = Debian Free Software Guidelines - # Ref: https://en.wikipedia.org/wiki/Debian_Free_Software_Guidelines - # Used as an apparent modifier to other licenses, to say they are approved per Debian. - # For example in this case, pytest-timeout has license: DFSG approved, MIT License, - # but is really just an MIT License that someone has checked is DFSG approved. 
- 'DFSG approved': [ - 'pytest-timeout', # MIT Licensed - ], + ALLOWED = ParkLabCommonLicenseChecker.ALLOWED + [ # Linking = With Restrictions, Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v2 or later (LGPLv2+)': [ - 'chardet' # used at runtime during server operation (ingestion), but not modified or distributed - ], + 'GNU Lesser General Public License v2 or later (LGPLv2+)', + 'LGPL-v2', 'LGPL-v2.0', 'LGPL-2', 'LGPL-2.0', + 'LGPL-v2+', 'LGPL-v2.0+', 'LGPL-2+', 'LGPL-2.0+', # Linking = With Restrictions, Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v3 or later (LGPLv3+)': [ - 'pytest-redis', # used only privately in testing, not used in server code, not modified, not distributed - 'mirakuru', # required by pytest-redis (used only where it's used) - ], + 'GNU Lesser General Public License v3 or later (LGPLv3+)', + 'LGPL-v3', 'LGPL-v3.0', 'LGPL-3', 'LGPL-3.0', + 'LGPL-v3+', 'LGPL-v3.0+', 'LGPL-3+', 'LGPL-3.0+', - 'GNU General Public License (GPL)': [ - 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs - ], + # Uncertain whether this is LGPL 2 or 3, but in any case we think weak copyleft should be OK + # for pipeline or server use as long as we're not distributing sources. 
+ 'LGPL', + 'GNU Library or Lesser General Public License (LGPL)', - # Linking = With Restrictions, Private Use = Yes + # Linking = "GPLv3 compatible only", Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - # 'GNU Lesser General Public License v3 or later (LGPLv3+)', + 'GPL-2-or-3', + 'GPL-v2+', 'GPL-v2.0+', 'GPL-2+', 'GPL-2.0+', + 'GPL-v3', 'GPL-v3.0', 'GPL-3', 'GPL-3.0', + 'GPL-v3+', 'GPL-v3.0+', 'GPL-3+', 'GPL-3.0+', - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Library or Lesser General Public License (LGPL)': [ - 'psycopg2', # Used at runtime during server operation, but not modified or distributed - 'psycopg2-binary', # Used at runtime during server operation, but not modified or distributed - 'chardet', # Potentially used downstream in loadxl to detect charset for text files - 'pyzmq', # Used in post-deploy-perf-tests, not distributed, and not modified or distributed - ], + # Uncertain whether this is GPL 2 or 3, but we'll assume that means we can use either. + # And version 3 is our preferred interpretation. + 'GNU General Public License', + 'GPL', - 'GPL-2.0': [ - # The license file for the node-forge javascript library says: - # - # "You may use the Forge project under the terms of either the BSD License or the - # GNU General Public License (GPL) Version 2." - # - # (We choose to use it under the BSD license.) - # Ref: https://www.npmjs.com/package/node-forge?activeTab=code - 'node-forge', - ], + RLicenseFramework.R_LANGUAGE_LICENSE_NAME - 'MIT*': [ + ] - # This library uses a mix of licenses, but they (MIT, CC0) generally seem permissive. - # (It also mentions that some tools for building/testing use other libraries.) 
- # Ref: https://github.com/requirejs/domReady/blob/master/LICENSE - 'domready', - # This library is under 'COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1' - # Ref: https://github.com/javaee/jsonp/blob/master/LICENSE.txt - # About CDDL ... - # Linking = Permissive, Private Use = ? - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'jsonp', +@LicenseCheckerRegistry.register_checker('park-lab-common-server') +class ParkLabCommonServerLicenseChecker(ParkLabCommonLicenseChecker): + """ + Checker for servers from Park Lab. - # This library says pretty clearly it intends MIT license. - # Ref: https://www.npmjs.com/package/component-indexof - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'component-indexof', + If you're at some other organization, we recommend you make a class that has values + suitable to your own organizational needs. + """ - # These look like a pretty straight MIT license. - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'mixin', # LICENSE file at https://www.npmjs.com/package/mixin?activeTab=code - 'stack-trace', # https://github.com/stacktracejs/stacktrace.js/blob/master/LICENSE - 'typed-function', # LICENSE at https://www.npmjs.com/package/typed-function?activeTab=code + LICENSE_FRAMEWORKS = ['python', 'javascript'] - ], + EXCEPTIONS = augment( + ParkLabCommonLicenseChecker.EXCEPTIONS, + by={ + 'BSD*': [ + # Although modified to insert the author name into the license text itself, + # the license for these libraries are essentially BSD-3-Clause. + 'formatio', + 'samsam', + + # There are some slightly different versions of what appear to be BSD licenses here, + # but clearly the license is permissive. 
+ # Ref: https://www.npmjs.com/package/mutation-observer?activeTab=readme + 'mutation-observer', + ], + + 'Custom: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global': [ + # The use of this URL appears to be a syntax error in the definition of entries-ponyfill + # In fact this seems to be covered by a CC0-1.0 license. + # Ref: https://unpkg.com/browse/object.entries-ponyfill@1.0.1/LICENSE + 'object.entries-ponyfill', + ], + + 'Custom: https://github.com/saikocat/colorbrewer.': [ + # The use of this URL appears to be a syntax error in the definition of cartocolor + # In fact, this seems to be covered by a CC-BY-3.0 license. + # Ref: https://www.npmjs.com/package/cartocolor?activeTab=readme + 'cartocolor', + ], + + 'Custom: https://travis-ci.org/component/emitter.png': [ + # The use of this png appears to be a syntax error in the definition of emitter-component. + # In fact, emitter-component uses an MIT License + # Ref: https://www.npmjs.com/package/emitter-component + # Ref: https://github.com/component/emitter/blob/master/LICENSE + 'emitter-component', + ], + + # The 'turfs-jsts' repository (https://github.com/DenisCarriere/turf-jsts/blob/master/README.md) + # seems to lack a license, but appears to be forked from the jsts library that uses + # the Eclipse Public License 1.0 and Eclipse Distribution License 1.0, so probably a permissive + # license is intended. 
+ 'Custom: https://travis-ci.org/DenisCarriere/turf-jsts.svg': [ + 'turf-jsts' + ], + + # Linking = With Restrictions, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'GNU Lesser General Public License v3 or later (LGPLv3+)': [ + # used only privately in testing, not used in server code, not modified, not distributed + 'pytest-redis', + # required by pytest-redis (used only where it's used) + 'mirakuru', + ], + + 'GNU General Public License (GPL)': [ + 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs + ], + + # Linking = With Restrictions, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + # 'GNU Lesser General Public License v3 or later (LGPLv3+)', - 'UNLICENSED': [ - # The udn-browser library is our own and has been observed to sometimes show up in some contexts - # as UNLICENSED, when really it's MIT. - # Ref: https://github.com/dbmi-bgm/udn-browser/blob/main/LICENSE - 'udn-browser', - ], + # Linking = With Restrictions, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'GNU Library or Lesser General Public License (LGPL)': [ + 'psycopg2', # Used at runtime during server operation, but not modified or distributed + 'psycopg2-binary', # Used at runtime during server operation, but not modified or distributed + 'chardet', # Potentially used downstream in loadxl to detect charset for text files + 'pyzmq', # Used in post-deploy-perf-tests, not distributed, and not modified or distributed + ], + + 'GPL-2.0': [ + # The license file for the node-forge javascript library says: + # + # "You may use the Forge project under the terms of either the BSD License or the + # GNU General Public License (GPL) Version 2." + # + # (We choose to use it under the BSD license.) 
+ # Ref: https://www.npmjs.com/package/node-forge?activeTab=code + 'node-forge', + ], + + 'MIT*': [ + + # This library uses a mix of licenses, but they (MIT, CC0) generally seem permissive. + # (It also mentions that some tools for building/testing use other libraries.) + # Ref: https://github.com/requirejs/domReady/blob/master/LICENSE + 'domready', + + # This library is under 'COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1' + # Ref: https://github.com/javaee/jsonp/blob/master/LICENSE.txt + # About CDDL ... + # Linking = Permissive, Private Use = ? + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'jsonp', + + # This library says pretty clearly it intends MIT license. + # Ref: https://www.npmjs.com/package/component-indexof + # Linking = Permissive, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'component-indexof', + + # These look like a pretty straight MIT license. + # Linking = Permissive, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'mixin', # LICENSE file at https://www.npmjs.com/package/mixin?activeTab=code + 'stack-trace', # https://github.com/stacktracejs/stacktrace.js/blob/master/LICENSE + 'typed-function', # LICENSE at https://www.npmjs.com/package/typed-function?activeTab=code + + ], + + 'UNLICENSED': [ + # The udn-browser library is our own and has been observed to sometimes show up in some contexts + # as UNLICENSED, when really it's MIT. + # Ref: https://github.com/dbmi-bgm/udn-browser/blob/main/LICENSE + 'udn-browser', + ], + }) + + +@LicenseCheckerRegistry.register_checker('c4-infrastructure') +class C4InfrastructureLicenseChecker(ParkLabCommonServerLicenseChecker): + """ + Checker for C4 infrastructure (Fourfront, CGAP, SMaHT) from Park Lab. 
+ """ - } + LICENSE_TITLE = "(The )?MIT License" +@LicenseCheckerRegistry.register_checker('c4-python-infrastructure') class C4PythonInfrastructureLicenseChecker(C4InfrastructureLicenseChecker): """ - For situations like dcicutils and dcicsnovault where there's no Javascript, this will test just Python. + Checker for C4 python library infrastructure (Fourfront, CGAP, SMaHT) from Park Lab. """ LICENSE_FRAMEWORKS = ['python'] + + +@LicenseCheckerRegistry.register_checker('scan2-pipeline') +class Scan2PipelineLicenseChecker(ParkLabGplPipelineLicenseChecker): + """ + Checker for SCAN2 library from Park Lab. + """ + + EXCEPTIONS = augment( + ParkLabGplPipelineLicenseChecker.EXCEPTIONS, + by={ + 'Custom: Matrix file LICENCE': [ + # The custom information in https://cran.r-project.org/web/packages/Matrix/LICENCE + # says there are potential extra restrictions beyond a simple GPL license + # if SparseSuite is used, but it is not requested explicitly by Scan2, and we're + # trusting that any other libraries used by Scan2 would have investigated this. + # So, effectively, we think the Matrix library for this situation operates the + # same as if it were just GPL-3 licensed, and we are fine with that. 
+ 'Matrix' + ] + }) diff --git a/dcicutils/scripts/run_license_checker.py b/dcicutils/scripts/run_license_checker.py new file mode 100644 index 000000000..31b3fa273 --- /dev/null +++ b/dcicutils/scripts/run_license_checker.py @@ -0,0 +1,63 @@ +import argparse + +from dcicutils.command_utils import script_catch_errors, ScriptFailure +from dcicutils.lang_utils import there_are, conjoined_list +from dcicutils.license_utils import LicenseCheckerRegistry, LicenseChecker +from dcicutils.misc_utils import PRINT +from typing import Optional, Type + + +EPILOG = __doc__ + + +ALL_CHECKER_NAMES = LicenseCheckerRegistry.all_checker_names() +NEWLINE = '\n' + + +def main(): + + parser = argparse.ArgumentParser( + description="Runs a license checker", + epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("name", type=str, default=None, nargs='?', + help=f"The name of a checker to run. " + + there_are(ALL_CHECKER_NAMES, kind='available checker', + show=True, joiner=conjoined_list, punctuate=True)) + args = parser.parse_args() + + with script_catch_errors(): + run_license_checker(name=args.name) + + +def show_help_for_choosing_license_checker(): + PRINT("") + PRINT(there_are(ALL_CHECKER_NAMES, kind='available checker', show=False, punctuation_mark=':')) + PRINT("") + wid = max(len(x) for x in ALL_CHECKER_NAMES) + 1 + for checker_name in ALL_CHECKER_NAMES: + checker_class = LicenseCheckerRegistry.lookup_checker(checker_name) + checker_doc = (checker_class.__doc__ or '').strip(' \t\n\r') + PRINT(f"{(checker_name + ':').ljust(wid)} {checker_doc.split(NEWLINE)[0]}") + PRINT("") + PRINT("=" * 42, "NOTES & DISCLAIMERS", "=" * 42) + PRINT("Park Lab is a research laboratory in the Department of Biomedical Informatics at Harvard Medical School.") + PRINT("Park Lab checkers are intended for internal use and may not be suitable for other purposes.") + PRINT("External organizations must make their own independent choices about license acceptability.") 
+ PRINT("Such choices can be integrated with this tool as follows:") + PRINT(" * Import LicenseChecker and LicenseCheckerRegistry from dcicutils.license_utils.") + PRINT(" * Make your own subclass of LicenseChecker, specifying a doc string and appropriate constraints.") + PRINT(" * Decorate your subclass with an appropriate call to LicenseCheckerRegistry.register_checker.") + PRINT("") + + +def run_license_checker(name: Optional[str]): + if name is None: + show_help_for_choosing_license_checker() + else: + try: + checker_class: Type[LicenseChecker] = LicenseCheckerRegistry.lookup_checker(name) + except Exception as e: + raise ScriptFailure(str(e)) + checker_class.validate() diff --git a/pyproject.toml b/pyproject.toml index 65dba0353..b0bd73490 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.11.0" +version = "7.11.0.1b0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" @@ -81,6 +81,8 @@ pytest-runner = ">=5.1" [tool.poetry.scripts] publish-to-pypi = "dcicutils.scripts.publish_to_pypi:main" show-contributors = "dcicutils.contribution_scripts:show_contributors_main" +run-license-checker = "dcicutils.scripts.run_license_checker:main" + [tool.pytest.ini_options] addopts = "--basetemp=/tmp/pytest" diff --git a/test/test_license_utils.py b/test/test_license_utils.py index 78eba6905..1779f4cca 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -8,7 +8,8 @@ from collections import defaultdict from dcicutils.license_utils import ( - LicenseFrameworkRegistry, LicenseFramework, PythonLicenseFramework, JavascriptLicenseFramework, + LicenseFrameworkRegistry, LicenseFramework, + PythonLicenseFramework, JavascriptLicenseFramework, RLicenseFramework, LicenseAnalysis, LicenseChecker, LicenseStatus, LicenseFileParser, LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure, 
warnings as license_utils_warnings_module, @@ -134,12 +135,12 @@ def test_license_framework_registry_register(): # decorator with LicenseFrameworkRegistry.temporary_registration_for_testing(): with pytest.raises(ValueError): - @LicenseFrameworkRegistry.register(name='bogus_dummy') + @LicenseFrameworkRegistry.register_framework(name='bogus_dummy') class BogusDummyLicenseFramework: pass ignored(BogusDummyLicenseFramework) - @LicenseFrameworkRegistry.register(name='dummy') + @LicenseFrameworkRegistry.register_framework(name='dummy') class DummyLicenseFramework(LicenseFramework): pass @@ -159,6 +160,7 @@ def test_license_framework_registry_all_frameworks(): assert sorted(frameworks, key=lambda x: x.NAME) == [ JavascriptLicenseFramework, PythonLicenseFramework, + RLicenseFramework ] @@ -166,7 +168,7 @@ def test_license_framework_registry_find_framework(): with LicenseFrameworkRegistry.temporary_registration_for_testing(): - @LicenseFrameworkRegistry.register(name='dummy1') + @LicenseFrameworkRegistry.register_framework(name='dummy1') class DummyLicenseFramework1(LicenseFramework): pass @@ -187,7 +189,8 @@ class DummyLicenseFramework1(LicenseFramework): def test_javascript_license_framework_implicated_licenses(): def check_implications(spec, implications): - assert JavascriptLicenseFramework.implicated_licenses(licenses_spec=spec) == implications + assert JavascriptLicenseFramework.implicated_licenses(package_name='ignored', + licenses_spec=spec) == implications check_implications(spec='(MIT AND BSD-3-Clause)', implications=['BSD-3-Clause', 'MIT']) check_implications(spec='(CC-BY-4.0 AND OFL-1.1 AND MIT)', implications=['CC-BY-4.0', 'MIT', 'OFL-1.1']) @@ -216,10 +219,10 @@ def test_javascript_license_framework_get_licenses(): mock_check_output.return_value = subprocess_output with printed_output() as printed: assert JavascriptLicenseFramework.get_dependencies() == [ - {'licenses': ['Apache-2.0'], 'name': 'package1'}, - {'licenses': ['MIT'], 'name': 'package2'}, - 
{'licenses': ['Apache-2.0', 'MIT'], 'name': 'package3'}, - {'licenses': [], 'name': 'package4'}, + {'framework': 'javascript', 'licenses': ['Apache-2.0'], 'name': 'package1'}, + {'framework': 'javascript', 'licenses': ['MIT'], 'name': 'package2'}, + {'framework': 'javascript', 'licenses': ['Apache-2.0', 'MIT'], 'name': 'package3'}, + {'framework': 'javascript', 'licenses': [], 'name': 'package4'}, ] assert printed.lines == [ "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']" @@ -625,6 +628,7 @@ def test_license_checker_analyze_license_dependencies_by_framework(): assert mock_analyze.mock_calls == [ mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=JavascriptLicenseFramework), mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=PythonLicenseFramework), + mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=RLicenseFramework), ] From 1512225e14062ea3e096f321c5080bc9cfd3f04b Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 15 Sep 2023 03:45:18 -0400 Subject: [PATCH 07/24] Add support for conda licenses. Allow the list of filenames accepted to include regexps. Add a lot of licenses. 
--- CHANGELOG.rst | 4 + dcicutils/license_utils.py | 353 ++++++++++++++++++----- dcicutils/misc_utils.py | 12 +- dcicutils/scripts/run_license_checker.py | 9 +- pyproject.toml | 2 +- test/test_license_utils.py | 4 +- test/test_misc_utils.py | 12 +- 7 files changed, 313 insertions(+), 83 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 99d4da9ce..4832109e7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -33,6 +33,10 @@ Change Log | ``Scan2PipelineLicenseChecker`` | ``scan2-pipeline`` | New | +------------------------------------------+--------------------------------+----------------+ +* In ``misc_utils``: + + * New function ``json_file_contents`` + * In ``scripts``: * Add a ``run-license-checker`` script, implemented by ``run_license_checker.py``, diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 8f6b9cefb..f13e3774d 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -1,6 +1,7 @@ import contextlib import csv import datetime +import glob import io import json # import logging @@ -30,7 +31,7 @@ # to modules, not relative references. Later when things are better installed, we can make refs relative again. from dcicutils.exceptions import InvalidParameterError from dcicutils.lang_utils import there_are -from dcicutils.misc_utils import PRINT, get_error_message, local_attrs, ignored +from dcicutils.misc_utils import PRINT, get_error_message, local_attrs, ignored, json_file_contents T = TypeVar("T") @@ -46,6 +47,10 @@ _STATUS = 'status' +def pattern(x): + return re.compile(x, re.IGNORECASE) + + def augment(d: dict, by: dict): return dict(d, **by) @@ -124,25 +129,90 @@ def all_frameworks(cls): return sorted(cls.LICENSE_FRAMEWORKS.values(), key=lambda x: x.NAME) +def extract_boolean_terms(boolean_expression: str) -> List[str]: + # We only care which licenses were mentioned, not what algebra is used on them. 
+ # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) + # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". + # If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will + # have trouble passing unless both MIT and GPL-3.0 are allowed. + terms = sorted(map(lambda x: x.strip(), + (boolean_expression + .replace('(', '') + .replace(')', '') + .replace(' AND ', ',') + .replace(' and ', ',') + .replace(' & ', ',') + .replace(' OR ', ',') + .replace(' or ', ',') + .replace('|', ',') + .replace(';', ',') + .replace(' + ', ',') + ).split(','))) + return terms + + +# This is intended to match ' (= 3)', ' (>= 3)', ' (version 3)', ' (version 3 or greater)' +# It will incidentally and harmlessly also take ' (>version 3)' or '(>= 3 or greater)'. +# It will also correctly handle the unlikely case of ' (= 3 or greater)' + +_OR_LATER_PATTERN = '(?:[- ]or[ -](?:greater|later))' +_PARENTHETICAL_VERSION_CONSTRAINT = re.compile(f'( [(]([>]?)(?:[=]|version) ([0-9.]+)({_OR_LATER_PATTERN}?)[)])') +_POSTFIX_OR_LATER_PATTERN = re.compile(f"({_OR_LATER_PATTERN})") +_GPL_VERSION_CHOICE = re.compile('^GPL-v?([0-9.+]) (?:OR|[|]) GPL-v?([0-9.+])$') + + +def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: + m = _GPL_VERSION_CHOICE.match(licenses_spec) + if m: + version_a, version_b = m.groups() + return f"GPL-{version_a}-or-{version_b}" + # We only care which licenses were mentioned, not what algebra is used on them. + # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) + # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". + # If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will + # have trouble passing unless both MIT and GPL-3.0 are allowed. 
+ transform_count = 0 + original_licenses_spec = licenses_spec + while True: + if transform_count > 100: # It'd be surprising if there were even ten of these to convert. + warnings.warn(f"Transforming {for_package_name} {licenses_spec!r} seemed to be looping." + f" Please report this as a bug.") + return licenses_spec # return the unmodified + transform_count += 1 + m = _PARENTHETICAL_VERSION_CONSTRAINT.search(licenses_spec) + if not m: + break + matched, greater, version_spec, greater2 = m.groups() + is_greater = bool(greater or greater2) + licenses_spec = licenses_spec.replace(matched, + f"-{version_spec}" + f"{'+' if is_greater else ''}") + print(f"REWRITING1: {licenses_spec}") + transform_count = 0 + while True: + if transform_count > 100: # It'd be surprising if there were even ten of these to convert. + warnings.warn(f"Transforming {for_package_name} {licenses_spec!r} seemed to be looping." + f" Please report this as a bug.") + return licenses_spec # return the unmodified + transform_count += 1 + m = _POSTFIX_OR_LATER_PATTERN.search(licenses_spec) + if not m: + break + matched = m.group(1) + licenses_spec = licenses_spec.replace(matched, '+') + print(f"REWRITING2: {licenses_spec}") + if licenses_spec != original_licenses_spec: + PRINT(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") + return licenses_spec + + @LicenseFrameworkRegistry.register_framework(name='javascript') class JavascriptLicenseFramework(LicenseFramework): @classmethod def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: ignored(package_name) - # We only care which licenses were mentioned, not what algebra is used on them. - # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) - # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". - # If all of those licenses match, all is good. 
That _does_ mean some things like (MIT OR GPL-3.0) will - # have trouble passing unless both MIT and GPL-3.0 are allowed. - licenses = sorted(map(lambda x: x.strip(), - (licenses_spec - .replace('(', '') - .replace(')', '') - .replace(' AND ', ',') - .replace(' OR ', ',') - ).split(','))) - return licenses + return extract_boolean_terms(licenses_spec) @classmethod def get_dependencies(cls): @@ -199,47 +269,52 @@ def get_dependencies(cls): return sorted(result, key=lambda x: x.get(_NAME).lower()) +@LicenseFrameworkRegistry.register_framework(name='conda') +class CondaLicenseFramework(LicenseFramework): + + @classmethod + def get_dependencies(cls): + prefix = os.environ.get("CONDA_LICENSE_CHECKER_PREFIX", os.environ.get("CONDA_PREFIX", "")) + result = [] + filespec = os.path.join(prefix, "conda-meta/*.json") + files = glob.glob(filespec) + for file in files: + data = json_file_contents(file) + package_name = data['name'] + package_license = data.get('license') or "MISSING" + if package_license: + # print(f"package_license={package_license}") + simplified_package_license_spec = simplify_license_versions(package_license, + for_package_name=package_name) + # print(f" =simplified_package_license_spec => {simplified_package_license_spec}") + package_licenses = extract_boolean_terms(simplified_package_license_spec) + # print(f"=> {package_licenses}") + else: + package_licenses = [] + entry = { + _NAME: package_name, + _LICENSES: package_licenses, + _FRAMEWORK: 'conda', + } + result.append(entry) + # print(f"conda get_dependencies result={json.dumps(result, indent=2)}") + # print("conda deps = ", json.dumps(result, indent=2)) + return result + + @LicenseFrameworkRegistry.register_framework(name='r') class RLicenseFramework(LicenseFramework): VERBOSE = False R_PART_SPEC = re.compile("^Part of R [0-9.]+$") - # This is intended to match ' (= 3)', ' (>= 3)', ' (version 3)', ' (version 3 or greater)' - # It will incidentally and harmlessly also take ' (>version 3)' or '(>= 3 or 
greater)'. - # It will also correctly handle the unlikely case of ' (= 3 or greater)' - # or will - VERSION_SPEC = re.compile('( [(]([>]?)(?:[=]|version) ([0-9.]+)((?: or (?:greater|later))?)[)])') - GPL_VERSION_CHOICE = re.compile('^GPL-v?([0-9.+]) (?:OR|[|]) GPL-v?([0-9.+])$') R_LANGUAGE_LICENSE_NAME = 'R-language-license' @classmethod def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: if cls.R_PART_SPEC.match(licenses_spec): return [cls.R_LANGUAGE_LICENSE_NAME] - m = cls.GPL_VERSION_CHOICE.match(licenses_spec) - if m: - version_a, version_b = m.groups() - return [f"GPL-{version_a}-or-{version_b}"] - # We only care which licenses were mentioned, not what algebra is used on them. - # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) - # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". - # If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will - # have trouble passing unless both MIT and GPL-3.0 are allowed. 
- n = 0 - original_licenses_spec = licenses_spec - while n < 1000: # just in case of an infinite loop - n += 1 - m = cls.VERSION_SPEC.search(licenses_spec) - if not m: - break - matched, greater, version_spec, greater2 = m.groups() - greater = greater or greater2 - licenses_spec = licenses_spec.replace(matched, - f"-{version_spec}" - f"{'+' if greater else ''}") - if licenses_spec != original_licenses_spec: - PRINT(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") + licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name) licenses = sorted(map(lambda x: x.strip(), (licenses_spec .replace('|', ',') @@ -475,6 +550,22 @@ def analyze_license_file(cls, *, analysis: LicenseAnalysis, check_license_title=license_title or cls.LICENSE_TITLE, analysis=analysis) + CHOICE_REGEXPS = {} + + @classmethod + def _make_regexp_for_choices(cls, choices): + inner_pattern = '|'.join('^' + (re.escape(choice) if isinstance(choice, str) else choice.pattern) + '$' + for choice in choices) or "^$" + return re.compile(f"({inner_pattern})", re.IGNORECASE) + + @classmethod + def _find_regexp_for_choices(cls, choices): + key = str(choices) + regexp = cls.CHOICE_REGEXPS.get(key) + if not regexp: + cls.CHOICE_REGEXPS[key] = regexp = cls._make_regexp_for_choices(choices) + return regexp + @classmethod def analyze_license_dependencies_for_framework(cls, *, analysis: LicenseAnalysis, @@ -482,7 +573,7 @@ def analyze_license_dependencies_for_framework(cls, *, acceptable: Optional[List[str]] = None, exceptions: Optional[Dict[str, str]] = None, ) -> None: - acceptable = (acceptable or []) + (cls.ALLOWED or []) + acceptability_regexp = cls._find_regexp_for_choices((acceptable or []) + (cls.ALLOWED or [])) exceptions = dict(cls.EXCEPTIONS or {}, **(exceptions or {})) try: @@ -512,7 +603,7 @@ def analyze_license_dependencies_for_framework(cls, *, by_special_exception = False for license_name in license_names: special_exceptions = exceptions.get(license_name, []) - 
if license_name in acceptable: + if acceptability_regexp.match(license_name): # license_name in acceptable: pass elif name in special_exceptions: by_special_exception = True @@ -556,7 +647,7 @@ def analyze_license_dependencies_by_framework(cls, *, def show_unacceptable_licenses(cls, *, analysis: LicenseAnalysis) -> LicenseAnalysis: if analysis.unacceptable: PRINT(there_are(analysis.unacceptable, kind="unacceptable license", show=False, punctuation_mark=':')) - for license, names in analysis.unacceptable.items(): + for license, names in sorted(analysis.unacceptable.items()): PRINT(f" {license}: {', '.join(names)}") return analysis @@ -666,16 +757,39 @@ class ParkLabCommonLicenseChecker(LicenseChecker): 'AFL-2.1', # Linking = Permissive, Private Use = Yes + # Apache licenses before version 2.0 are controversial, but we here construe an unmarked naming to imply + # any version, and hence v2. # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses 'Apache Software License', 'Apache-Style', - 'Apache-2.0', + pattern("Apache([- ]2([.]0)?)?([- ]Licen[cs]e)?([- ]with[- ]LLVM[- ]exception)?"), + # 'Apache-2.0', + + # Artistic License 1.0 was confusing to people, so its status as permissive is in general uncertain, + # however the issue seems to revolve around point 8 (relating to whether or not perl is deliberately + # exposed). That isn't in play for our uses, so we don't flag it here. + # Artistic license 2.0 is a permissive license. + # Ref: https://en.wikipedia.org/wiki/Artistic_License + 'Artistic-1.0-Perl', + pattern('Artistic[- ]2([.]0)?'), + + # According to Wikipedia, the Boost is considered permissive and BSD-like. 
+ # Refs: + # * + # * https://en.wikipedia.org/wiki/Boost_(C%2B%2B_libraries)#License + pattern('(BSL|Boost(([- ]Software)?[- ]License)?)([- ]1([.]0)?)?'), # Linking = Permissive, Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'BSD License', - 'BSD-2-Clause', - 'BSD-3-Clause', + pattern('((modified[- ])?[234][- ]Clause[- ])?BSD([- ][234][- ]Clause)?( Licen[cs]e)?'), + # 'BSD License', + # 'BSD-2-Clause', + # 'BSD-3-Clause', + # 'BSD 3-Clause', + + # BZIP2 is a permissive license + # Ref: https://github.com/asimonov-im/bzip2/blob/master/LICENSE + pattern('bzip2(-1[.0-9]*)'), # Linking = Public Domain, Private Use = Public Domain # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses @@ -688,6 +802,10 @@ class ParkLabCommonLicenseChecker(LicenseChecker): 'CC-BY-3.0', 'CC-BY-4.0', + # The curl license is a permissive license. + # Ref: https://curl.se/docs/copyright.html + 'curl', + # Linking = Permissive, Private Use = ? # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses 'CDDL', @@ -708,9 +826,25 @@ class ParkLabCommonLicenseChecker(LicenseChecker): 'FSF Unlimited License', 'FSFUL', + # The FreeType license is a permissive license. + # Ref: LicenseRef-FreeType + pattern('(Licen[cs]eRef-)?(FTL|FreeType( Licen[cs]e)?)'), + # Linking = Yes, Cat = Permissive Software Licenses # Ref: https://en.wikipedia.org/wiki/Historical_Permission_Notice_and_Disclaimer 'Historical Permission Notice and Disclaimer (HPND)', + 'HPND', + pattern('(Licen[cs]eRef-)?PIL'), + # The Pillow or Python Image Library is an HPND license, which is a simple permissive license: + # Refs: + # * https://github.com/python-pillow/Pillow/blob/main/LICENSE + # * https://www.fsf.org/blogs/licensing/historical-permission-notice-and-disclaimer-added-to-license-list + + # The IJG license, used by Independent JPEG Group (IJG) is a custom permissive license. 
+ # Refs: + # * https://en.wikipedia.org/wiki/Libjpeg + # * https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/LICENSE.md + 'IJG', # Linking = Permissive, Private Use = Permissive # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses @@ -735,10 +869,11 @@ class ParkLabCommonLicenseChecker(LicenseChecker): 'OFL-1.1', # Ref: https://en.wikipedia.org/wiki/Public_domain - 'Public Domain', + pattern('(Licen[cs]eRef-)?Public[- ]Domain([- ]dedic[t]?ation)?'), # "dedictation" is a typo in docutils # Linking = Permissive, Private Use = Permissive # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + pattern('(Licen[cs]eRef-)?PSF-2([.][.0-9]*)'), 'Python Software Foundation License', 'Python-2.0', @@ -746,6 +881,17 @@ class ParkLabCommonLicenseChecker(LicenseChecker): # Ref: https://en.wikipedia.org/wiki/Pylons_project 'Repoze Public License', + # The TCL or Tcl/Tk licenses are permissive licenses. + # Ref: https://www.tcl.tk/software/tcltk/license.html + # The one used by the tktable library has a 'bourbon' clause that doesn't add compliance requirements + # Ref: https://github.com/wjoye/tktable/blob/master/license.txt + pattern('Tcl([/]tk)?'), + + # The Ubuntu Font Licence is mostly permissive. It contains some restrictions if you are going to modify the + # fonts that require you to change the name to avoid confusion. But for our purposes, we're assuming that's + # not done, and so we're not flagging it. 
+ pattern('Ubuntu Font Licen[cs]e Version( 1([.]0)?)?'), + # Linking = Permissive/Public domain, Private Use = Permissive/Public domain # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses 'The Unlicense (Unlicense)', @@ -798,10 +944,65 @@ class ParkLabCommonLicenseChecker(LicenseChecker): 'chardet' # used at runtime during server operation (ingestion), but not modified or distributed ], + # Linking = With Restrictions, Private Use = Yes + # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + 'GNU Lesser General Public License v3 or later (LGPLv3+)': [ + # used only privately in testing, not used in server code, not modified, not distributed + 'pytest-redis', + # required by pytest-redis (used only where it's used) + 'mirakuru', + ], + 'GNU General Public License (GPL)': [ 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs ], + 'MIT/X11 Derivative': [ + # The license used by libxkbcommon is complicated and involves numerous included licenses, + # but all are permissive. + # Ref: https://github.com/xkbcommon/libxkbcommon/blob/master/LICENSE + 'libxkbcommon', + ], + + 'None': [ + # It's not obvious why Conda shows this license as 'None'. + # In fact, though, BSD 3-Clause "New" or "Revised" License + # Ref: https://github.com/AnacondaRecipes/_libgcc_mutex-feedstock/blob/master/LICENSE.txt + '_libgcc_mutex', + ], + + 'PostgreSQL': [ + # The libpq library is actually licensed with a permissive BSD 3-Clause "New" or "Revised" License + # Ref: https://github.com/lpsmith/postgresql-libpq/blob/master/LICENSE + 'libpq', + ], + + 'UCSD': [ + # It isn't obvious why these show up with a UCSD license in Conda. 
+ # The actual sources say it should be a 2-clause BSD license: + # Refs: + # * https://github.com/AlexandrovLab/SigProfilerMatrixGenerator/blob/master/LICENSE + # * https://github.com/AlexandrovLab/SigProfilerPlotting/blob/master/LICENSE + 'sigprofilermatrixgenerator', + 'sigprofilerplotting', + ], + + 'X11': [ + # The ncurses library has a VERY complicated history, BUT seems consistently permissive + # and the most recent version seems to be essentially the MIT license. + # Refs: + # * https://en.wikipedia.org/wiki/Ncurses#License + # * https://invisible-island.net/ncurses/ncurses-license.html + 'ncurses' + ], + + 'zlib-acknowledgement': [ + # It isn't clear whey libpng shows up with this license name, but the license for libpng + # is a permissive license. + # Ref: https://github.com/glennrp/libpng/blob/libpng16/LICENSE + 'libpng', + ], + } EXPECTED_MISSING_LICENSES = [ @@ -922,33 +1123,46 @@ class ParkLabGplPipelineLicenseChecker(ParkLabCommonLicenseChecker): Minimal checker common to GPL pipelines from Park Lab. """ - LICENSE_FRAMEWORKS = ['python', 'r'] # TODO: Implement 'conda' and add it here. + LICENSE_FRAMEWORKS = ['python', 'conda', 'r'] ALLOWED = ParkLabCommonLicenseChecker.ALLOWED + [ # Linking = With Restrictions, Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v2 or later (LGPLv2+)', - 'LGPL-v2', 'LGPL-v2.0', 'LGPL-2', 'LGPL-2.0', - 'LGPL-v2+', 'LGPL-v2.0+', 'LGPL-2+', 'LGPL-2.0+', - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v3 or later (LGPLv3+)', - 'LGPL-v3', 'LGPL-v3.0', 'LGPL-3', 'LGPL-3.0', - 'LGPL-v3+', 'LGPL-v3.0+', 'LGPL-3+', 'LGPL-3.0+', + # The "exceptions", if present, indicate waivers to source delivery requirements. 
+ # Ref: https://spdx.org/licenses/LGPL-3.0-linking-exception.html + pattern('GNU Lesser General Public License v2( or later)?( [(]LGPL[v]?[23][+]?[)])?'), + # 'GNU Lesser General Public License v2 or later (LGPLv2+)', + # 'GNU Lesser General Public License v3 or later (LGPLv3+)', + # 'LGPLv2', 'LGPL-v2', 'LGPL-v2.0', 'LGPL-2', 'LGPL-2.0', + # 'LGPLv2+', 'LGPL-v2+', 'LGPL-v2.0+', 'LGPL-2+', 'LGPL-2.0+', + # 'LGPLv3', 'LGPL-v3', 'LGPL-v3.0', 'LGPL-3', 'LGPL-3.0', + # 'LGPLv3+', 'LGPL-v3+', 'LGPL-v3.0+', 'LGPL-3+', 'LGPL-3.0+', + pattern('LGPL[v-]?[.0-9]*([+]|-only)?([- ]with[- ]exceptions)?'), # Uncertain whether this is LGPL 2 or 3, but in any case we think weak copyleft should be OK # for pipeline or server use as long as we're not distributing sources. 'LGPL', 'GNU Library or Lesser General Public License (LGPL)', + # GPL + # * library exception operates like LGPL + # * classpath exception is a linking exception related to Oracle + # Refs: + # * https://www.gnu.org/licenses/old-licenses/gpl-1.0.en.html + # * https://spdx.org/licenses/GPL-2.0-with-GCC-exception.html + # * https://spdx.org/licenses/GPL-3.0-with-GCC-exception.html + pattern('(GNU General Public License|GPL)[ ]?[v-]?[123]([.]0)?([+]|[- ]only)?' + '([- ]with[- ]GCC(([- ]runtime)?[- ]library)?[- ]exception([- ][.0-9]*)?)?' 
+ '([- ]with[- ]Classpath[- ]exception([- ][.0-9]+)?)?'), + # Linking = "GPLv3 compatible only", Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GPL-2-or-3', - 'GPL-v2+', 'GPL-v2.0+', 'GPL-2+', 'GPL-2.0+', - 'GPL-v3', 'GPL-v3.0', 'GPL-3', 'GPL-3.0', - 'GPL-v3+', 'GPL-v3.0+', 'GPL-3+', 'GPL-3.0+', + 'GPL-2-or-3', # we sometimes generate this token + # 'GPLv2+', 'GPL-v2+', 'GPL-v2.0+', 'GPL-2+', 'GPL-2.0+', + # 'GPLv3', 'GPL-v3', 'GPL-v3.0', 'GPL-3', 'GPL-3.0', + # 'GPLv3+', 'GPL-v3+', 'GPL-v3.0+', 'GPL-3+', 'GPL-3.0+', + # 'GPLv3-only', 'GPL-3-only', 'GPL-v3-only', 'GPL-3.0-only', 'GPL-v3.0-only', # Uncertain whether this is GPL 2 or 3, but we'll assume that means we can use either. # And version 3 is our preferred interpretation. @@ -1016,15 +1230,6 @@ class ParkLabCommonServerLicenseChecker(ParkLabCommonLicenseChecker): 'turf-jsts' ], - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v3 or later (LGPLv3+)': [ - # used only privately in testing, not used in server code, not modified, not distributed - 'pytest-redis', - # required by pytest-redis (used only where it's used) - 'mirakuru', - ], - 'GNU General Public License (GPL)': [ 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs ], diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index cc18f4b19..115fd00ff 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -7,10 +7,11 @@ import functools import hashlib import inspect -import math import io -import os +import json import logging +import math +import os import pytz import re import rfc3986.validators @@ -20,8 +21,8 @@ import webtest # importing the library makes it easier to mock testing from collections import defaultdict -from dateutil.parser import parse as dateutil_parse from datetime import 
datetime as datetime_type +from dateutil.parser import parse as dateutil_parse from typing import Optional @@ -1310,6 +1311,11 @@ def file_contents(filename, binary=False): return fp.read() +def json_file_contents(filename): + with io.open(filename, 'r') as fp: + return json.load(fp) + + def camel_case_to_snake_case(s, separator='_'): """ Converts CamelCase to snake_case. diff --git a/dcicutils/scripts/run_license_checker.py b/dcicutils/scripts/run_license_checker.py index 31b3fa273..33ff4b90c 100644 --- a/dcicutils/scripts/run_license_checker.py +++ b/dcicutils/scripts/run_license_checker.py @@ -2,8 +2,8 @@ from dcicutils.command_utils import script_catch_errors, ScriptFailure from dcicutils.lang_utils import there_are, conjoined_list -from dcicutils.license_utils import LicenseCheckerRegistry, LicenseChecker -from dcicutils.misc_utils import PRINT +from dcicutils.license_utils import LicenseCheckerRegistry, LicenseChecker, LicenseCheckFailure +from dcicutils.misc_utils import PRINT, get_error_message from typing import Optional, Type @@ -60,4 +60,7 @@ def run_license_checker(name: Optional[str]): checker_class: Type[LicenseChecker] = LicenseCheckerRegistry.lookup_checker(name) except Exception as e: raise ScriptFailure(str(e)) - checker_class.validate() + try: + checker_class.validate() + except LicenseCheckFailure as e: + raise ScriptFailure(get_error_message(e)) diff --git a/pyproject.toml b/pyproject.toml index b0bd73490..d107a498e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.11.0.1b0" +version = "7.11.0.1b1" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_license_utils.py b/test/test_license_utils.py index 1779f4cca..36691b39f 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -9,7 +9,7 @@ from collections import defaultdict from dcicutils.license_utils 
import ( LicenseFrameworkRegistry, LicenseFramework, - PythonLicenseFramework, JavascriptLicenseFramework, RLicenseFramework, + PythonLicenseFramework, JavascriptLicenseFramework, CondaLicenseFramework, RLicenseFramework, LicenseAnalysis, LicenseChecker, LicenseStatus, LicenseFileParser, LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure, warnings as license_utils_warnings_module, @@ -158,6 +158,7 @@ def test_license_framework_registry_all_frameworks(): assert all(isinstance(framework, type) and issubclass(framework, LicenseFramework) for framework in frameworks) assert sorted(frameworks, key=lambda x: x.NAME) == [ + CondaLicenseFramework, JavascriptLicenseFramework, PythonLicenseFramework, RLicenseFramework @@ -626,6 +627,7 @@ def test_license_checker_analyze_license_dependencies_by_framework(): analysis = LicenseAnalysis() LicenseChecker.analyze_license_dependencies_by_framework(analysis=analysis, frameworks=None) assert mock_analyze.mock_calls == [ + mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=CondaLicenseFramework), mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=JavascriptLicenseFramework), mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=PythonLicenseFramework), mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=RLicenseFramework), diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index a07c6d234..778aabac3 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -30,7 +30,7 @@ classproperty, classproperty_cached, classproperty_cached_each_subclass, Singleton, NamedObject, obsolete, ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime, - MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, + MIN_DATETIME, MIN_DATETIME_UTC, INPUT, 
builtin_print, map_chunked, to_camel_case, json_file_contents, ) from dcicutils.qa_utils import ( Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output, @@ -1788,6 +1788,16 @@ def test_file_contents(): assert file_contents("foo.bin", binary=False) == 'Hello!\n' +def test_json_file_contents(): + + mfs = MockFileSystem() + sample_data = {"foo": 1, "bar": [2, True]} + with mock.patch("io.open", mfs.open): + with io.open("foo.txt", 'w') as fp: + json.dump(sample_data, fp) + assert json_file_contents("foo.txt") == sample_data + + def test_make_counter(): counter = make_counter() From 25bd05a8c570ee26df3def7ffdcc6527423a1073 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 15 Sep 2023 03:59:21 -0400 Subject: [PATCH 08/24] Remove some debugging typeout. --- dcicutils/license_utils.py | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index f13e3774d..8452d3823 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -187,7 +187,7 @@ def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: licenses_spec = licenses_spec.replace(matched, f"-{version_spec}" f"{'+' if is_greater else ''}") - print(f"REWRITING1: {licenses_spec}") + # print(f"REWRITING1: {licenses_spec}") transform_count = 0 while True: if transform_count > 100: # It'd be surprising if there were even ten of these to convert. 
@@ -200,9 +200,9 @@ def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: break matched = m.group(1) licenses_spec = licenses_spec.replace(matched, '+') - print(f"REWRITING2: {licenses_spec}") - if licenses_spec != original_licenses_spec: - PRINT(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") + # print(f"REWRITING2: {licenses_spec}") + # if licenses_spec != original_licenses_spec: + # print(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") return licenses_spec @@ -229,7 +229,7 @@ def get_dependencies(cls): licenses_spec = record.get(_LICENSES) if '(' in licenses_spec: licenses = cls.implicated_licenses(package_name=name, licenses_spec=licenses_spec) - PRINT(f"Rewriting {licenses_spec!r} as {licenses!r}") + # print(f"Rewriting {licenses_spec!r} as {licenses!r}") elif licenses_spec: licenses = [licenses_spec] else: diff --git a/pyproject.toml b/pyproject.toml index d107a498e..97300d791 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.11.0.1b1" +version = "7.11.0.1b2" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 869471cfd97546f34499f0204be0119b470451f2 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 15 Sep 2023 04:01:43 -0400 Subject: [PATCH 09/24] PEP8 --- dcicutils/license_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 8452d3823..4164af741 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -31,7 +31,7 @@ # to modules, not relative references. Later when things are better installed, we can make refs relative again. 
from dcicutils.exceptions import InvalidParameterError from dcicutils.lang_utils import there_are -from dcicutils.misc_utils import PRINT, get_error_message, local_attrs, ignored, json_file_contents +from dcicutils.misc_utils import PRINT, get_error_message, ignorable, ignored, json_file_contents, local_attrs T = TypeVar("T") @@ -173,6 +173,7 @@ def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: # have trouble passing unless both MIT and GPL-3.0 are allowed. transform_count = 0 original_licenses_spec = licenses_spec + ignorable(original_licenses_spec) # sometimes useful for debugging while True: if transform_count > 100: # It'd be surprising if there were even ten of these to convert. warnings.warn(f"Transforming {for_package_name} {licenses_spec!r} seemed to be looping." From 50a94d2b362c342b94c825a90df732de6775e7bb Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 15 Sep 2023 04:17:20 -0400 Subject: [PATCH 10/24] Some other shuffling of available license checkers. --- dcicutils/license_utils.py | 53 ++++++++++++++++++++++---------------- pyproject.toml | 2 +- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 4164af741..d76dd6a23 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -1118,15 +1118,22 @@ class ParkLabCommonLicenseChecker(LicenseChecker): ] +@LicenseCheckerRegistry.register_checker('park-lab-pipeline') +class ParkLabPipelineLicenseChecker(ParkLabCommonLicenseChecker): + """ + Minimal checker common to pipelines from Park Lab. + """ + + LICENSE_FRAMEWORKS = ['python', 'conda', 'r'] + + @LicenseCheckerRegistry.register_checker('park-lab-gpl-pipeline') class ParkLabGplPipelineLicenseChecker(ParkLabCommonLicenseChecker): """ Minimal checker common to GPL pipelines from Park Lab. 
""" - LICENSE_FRAMEWORKS = ['python', 'conda', 'r'] - - ALLOWED = ParkLabCommonLicenseChecker.ALLOWED + [ + ALLOWED = ParkLabPipelineLicenseChecker.ALLOWED + [ # Linking = With Restrictions, Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses @@ -1314,22 +1321,24 @@ class C4PythonInfrastructureLicenseChecker(C4InfrastructureLicenseChecker): LICENSE_FRAMEWORKS = ['python'] -@LicenseCheckerRegistry.register_checker('scan2-pipeline') -class Scan2PipelineLicenseChecker(ParkLabGplPipelineLicenseChecker): - """ - Checker for SCAN2 library from Park Lab. - """ - - EXCEPTIONS = augment( - ParkLabGplPipelineLicenseChecker.EXCEPTIONS, - by={ - 'Custom: Matrix file LICENCE': [ - # The custom information in https://cran.r-project.org/web/packages/Matrix/LICENCE - # says there are potential extra restrictions beyond a simple GPL license - # if SparseSuite is used, but it is not requested explicitly by Scan2, and we're - # trusting that any other libraries used by Scan2 would have investigated this. - # So, effectively, we think the Matrix library for this situation operates the - # same as if it were just GPL-3 licensed, and we are fine with that. - 'Matrix' - ] - }) +# Need to figure out if this shoul duse the regular pipeline checker of the GPL checker as the parent here. +# +# @LicenseCheckerRegistry.register_checker('scan2-pipeline') +# class Scan2PipelineLicenseChecker(ParkLabGplPipelineLicenseChecker): +# """ +# Checker for SCAN2 library from Park Lab. +# """ +# +# EXCEPTIONS = augment( +# ParkLabGplPipelineLicenseChecker.EXCEPTIONS, +# by={ +# 'Custom: Matrix file LICENCE': [ +# # The custom information in https://cran.r-project.org/web/packages/Matrix/LICENCE +# # says there are potential extra restrictions beyond a simple GPL license +# # if SparseSuite is used, but it is not requested explicitly by Scan2, and we're +# # trusting that any other libraries used by Scan2 would have investigated this. 
+# # So, effectively, we think the Matrix library for this situation operates the +# # same as if it were just GPL-3 licensed, and we are fine with that. +# 'Matrix' +# ] +# }) diff --git a/pyproject.toml b/pyproject.toml index 97300d791..8ff28816b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.11.0.1b2" +version = "7.11.0.1b3" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 521fc83d2e399a3d3f62394b337436b15eaddef6 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Tue, 19 Sep 2023 14:44:56 -0400 Subject: [PATCH 11/24] Minor light refactoring. Mostly addition of new licenses. --- dcicutils/license_utils.py | 126 ++++++++++++++++++++++++++----------- pyproject.toml | 2 +- test/test_license_utils.py | 71 ++++++++++----------- 3 files changed, 125 insertions(+), 74 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index d76dd6a23..14dd74ca1 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -31,7 +31,9 @@ # to modules, not relative references. Later when things are better installed, we can make refs relative again. 
from dcicutils.exceptions import InvalidParameterError from dcicutils.lang_utils import there_are -from dcicutils.misc_utils import PRINT, get_error_message, ignorable, ignored, json_file_contents, local_attrs +from dcicutils.misc_utils import ( + PRINT, get_error_message, ignorable, ignored, json_file_contents, local_attrs, environ_bool, +) T = TypeVar("T") @@ -63,9 +65,13 @@ class LicenseStatus: UNEXPECTED_MISSING = "UNEXPECTED_MISSING" +LICENSE_UTILS_VERBOSE = environ_bool("LICENSE_UTILS_VERBOSE", default=False) + + class LicenseFramework: NAME = None + VERBOSE = LICENSE_UTILS_VERBOSE @classmethod def get_dependencies(cls): @@ -129,7 +135,7 @@ def all_frameworks(cls): return sorted(cls.LICENSE_FRAMEWORKS.values(), key=lambda x: x.NAME) -def extract_boolean_terms(boolean_expression: str) -> List[str]: +def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> List[str]: # We only care which licenses were mentioned, not what algebra is used on them. # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". 
@@ -147,6 +153,7 @@ def extract_boolean_terms(boolean_expression: str) -> List[str]: .replace('|', ',') .replace(';', ',') .replace(' + ', ',') + .replace('file ', f'Custom: {for_package_name} file ') ).split(','))) return terms @@ -161,7 +168,7 @@ def extract_boolean_terms(boolean_expression: str) -> List[str]: _GPL_VERSION_CHOICE = re.compile('^GPL-v?([0-9.+]) (?:OR|[|]) GPL-v?([0-9.+])$') -def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: +def simplify_license_versions(licenses_spec: str, *, for_package_name, verbose: bool = False) -> str: m = _GPL_VERSION_CHOICE.match(licenses_spec) if m: version_a, version_b = m.groups() @@ -188,7 +195,6 @@ def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: licenses_spec = licenses_spec.replace(matched, f"-{version_spec}" f"{'+' if is_greater else ''}") - # print(f"REWRITING1: {licenses_spec}") transform_count = 0 while True: if transform_count > 100: # It'd be surprising if there were even ten of these to convert. 
@@ -201,9 +207,8 @@ def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: break matched = m.group(1) licenses_spec = licenses_spec.replace(matched, '+') - # print(f"REWRITING2: {licenses_spec}") - # if licenses_spec != original_licenses_spec: - # print(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") + if verbose and licenses_spec != original_licenses_spec: + print(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") return licenses_spec @@ -213,7 +218,7 @@ class JavascriptLicenseFramework(LicenseFramework): @classmethod def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: ignored(package_name) - return extract_boolean_terms(licenses_spec) + return extract_boolean_terms(licenses_spec, for_package_name=package_name) @classmethod def get_dependencies(cls): @@ -288,7 +293,8 @@ def get_dependencies(cls): simplified_package_license_spec = simplify_license_versions(package_license, for_package_name=package_name) # print(f" =simplified_package_license_spec => {simplified_package_license_spec}") - package_licenses = extract_boolean_terms(simplified_package_license_spec) + package_licenses = extract_boolean_terms(simplified_package_license_spec, + for_package_name=package_name) # print(f"=> {package_licenses}") else: package_licenses = [] @@ -298,6 +304,7 @@ def get_dependencies(cls): _FRAMEWORK: 'conda', } result.append(entry) + result.sort(key=lambda x: x['name']) # print(f"conda get_dependencies result={json.dumps(result, indent=2)}") # print("conda deps = ", json.dumps(result, indent=2)) return result @@ -306,8 +313,6 @@ def get_dependencies(cls): @LicenseFrameworkRegistry.register_framework(name='r') class RLicenseFramework(LicenseFramework): - VERBOSE = False - R_PART_SPEC = re.compile("^Part of R [0-9.]+$") R_LANGUAGE_LICENSE_NAME = 'R-language-license' @@ -316,15 +321,21 @@ def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: if 
cls.R_PART_SPEC.match(licenses_spec): return [cls.R_LANGUAGE_LICENSE_NAME] licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name) - licenses = sorted(map(lambda x: x.strip(), - (licenses_spec - .replace('|', ',') - .replace('file ', f'Custom: {package_name} file ') - ).split(','))) + licenses = extract_boolean_terms(licenses_spec, for_package_name=package_name) + # licenses = sorted(map(lambda x: x.strip(), + # (licenses_spec + # .replace('|', ',') + # .replace('file ', f'Custom: {package_name} file ') + # ).split(','))) return licenses @classmethod def get_dependencies(cls): + # NOTE: Although the R Language itself is released under the GPL, our belief is that it is + # still possible to write programs in R that are not GPL, even programs that use commercial licenses. + # So we do ordinary license checking here, same as in other frameworks. + # For notes on this, see the R FAQ. + # Ref: https://cran.r-project.org/doc/FAQ/R-FAQ.html#Can-I-use-R-for-commercial-purposes_003f _PACKAGE = "Package" _LICENSE = "License" @@ -365,7 +376,7 @@ def get_dependencies(cls): class LicenseFileParser: - VERBOSE = False + VERBOSE = LICENSE_UTILS_VERBOSE SEPARATORS = '-.,' SEPARATORS_AND_WHITESPACE = SEPARATORS + ' \t' @@ -930,6 +941,18 @@ class ParkLabCommonLicenseChecker(LicenseChecker): EXCEPTIONS = { + # The Bioconductor zlibbioc license is a permissive license. + # Ref: https://github.com/Bioconductor/zlibbioc/blob/devel/LICENSE + 'Custom: bioconductor-zlibbioc file LICENSE': [ + 'bioconductor-zlibbioc' + ], + + # The Bioconductor rsamtools license is an MIT license + # Ref: https://bioconductor.org/packages/release/bioc/licenses/Rsamtools/LICENSE + 'Custom: bioconductor-rsamtools file LICENSE': [ + 'bioconductor-rsamtools' + ], + # DFSG = Debian Free Software Guidelines # Ref: https://en.wikipedia.org/wiki/Debian_Free_Software_Guidelines # Used as an apparent modifier to other licenses, to say they are approved per Debian. 
@@ -939,6 +962,19 @@ class ParkLabCommonLicenseChecker(LicenseChecker): 'pytest-timeout', # MIT Licensed ], + 'FOSS': [ + # The r-stringi library is a conda library that implements a stringi (pronounced "stringy") library for R. + # The COnda source feed is: https://github.com/conda-forge/r-stringi-feedstock + # This page explains that the home source is https://stringi.gagolewski.com/ but that's a doc page. + # The doc page says: + # > stringi’s source code is hosted on GitHub. + # > It is distributed under the open source BSD-3-clause license. + # The source code has a license that begins with a BSD-3-clause license and includes numerous others, + # but they all appear to be permissive. + # Ref: https://github.com/gagolews/stringi/blob/master/LICENSE + 'stringi', 'r-stringi', + ], + # Linking = With Restrictions, Private Use = Yes # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses 'GNU Lesser General Public License v2 or later (LGPLv2+)': [ @@ -1321,24 +1357,38 @@ class C4PythonInfrastructureLicenseChecker(C4InfrastructureLicenseChecker): LICENSE_FRAMEWORKS = ['python'] -# Need to figure out if this shoul duse the regular pipeline checker of the GPL checker as the parent here. -# -# @LicenseCheckerRegistry.register_checker('scan2-pipeline') -# class Scan2PipelineLicenseChecker(ParkLabGplPipelineLicenseChecker): -# """ -# Checker for SCAN2 library from Park Lab. -# """ -# -# EXCEPTIONS = augment( -# ParkLabGplPipelineLicenseChecker.EXCEPTIONS, -# by={ -# 'Custom: Matrix file LICENCE': [ -# # The custom information in https://cran.r-project.org/web/packages/Matrix/LICENCE -# # says there are potential extra restrictions beyond a simple GPL license -# # if SparseSuite is used, but it is not requested explicitly by Scan2, and we're -# # trusting that any other libraries used by Scan2 would have investigated this. 
-# # So, effectively, we think the Matrix library for this situation operates the -# # same as if it were just GPL-3 licensed, and we are fine with that. -# 'Matrix' -# ] -# }) +@LicenseCheckerRegistry.register_checker('scan2-pipeline') +class Scan2PipelineLicenseChecker(ParkLabGplPipelineLicenseChecker): + """ + Checker for SCAN2 library from Park Lab. + """ + + EXCEPTIONS = augment( + ParkLabGplPipelineLicenseChecker.EXCEPTIONS, + by={ + 'Custom: Matrix file LICENCE': [ + # The custom information in https://cran.r-project.org/web/packages/Matrix/LICENCE + # says there are potential extra restrictions beyond a simple GPL license + # if SparseSuite is used, but it is not requested explicitly by Scan2, and we're + # trusting that any other libraries used by Scan2 would have investigated this. + # So, effectively, we think the Matrix library for this situation operates the + # same as if it were just GPL-3 licensed, and we are fine with that. + 'Matrix' + ], + + "MISSING": [ + # mysql-common and mysql-libs are GPL, but since they are delivered by conda + # and not distributed as part of the Scan2 distribution, they should be OK. 
+ # Ref: https://redresscompliance.com/mysql-license-a-complete-guide-to-licensing/#:~:text=commercial%20use # noQA + 'mysql-common', + 'mysql-libs', + + # This is our own library + 'r-scan2', 'scan2', + ] + } + ) + + EXPECTED_MISSING_LICENSES = ParkLabGplPipelineLicenseChecker.EXPECTED_MISSING_LICENSES + [ + + ] diff --git a/pyproject.toml b/pyproject.toml index 8ff28816b..ffee79607 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.11.0.1b3" +version = "7.11.0.1b4" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/test/test_license_utils.py b/test/test_license_utils.py index 36691b39f..01d898eff 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -14,7 +14,7 @@ LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure, warnings as license_utils_warnings_module, ) -from dcicutils.misc_utils import ignored, file_contents +from dcicutils.misc_utils import ignored, file_contents, local_attrs from dcicutils.qa_utils import printed_output, MockFileSystem from unittest import mock @@ -201,40 +201,41 @@ def check_implications(spec, implications): check_implications(spec='(FOO OR (BAR AND BAZ))', implications=['BAR', 'BAZ', 'FOO']) -def test_javascript_license_framework_get_licenses(): - - print() # start on a fresh line - packages = {} - for i, license in enumerate(['Apache-2.0', 'MIT', '(MIT OR Apache-2.0)', ''], start=1): - package = f'package{i}' - packages[f"package{i}"] = { - "licenses": license, - "repository": f"https://github.com/dummy/{package}", - "publisher": f"J Dummy{i}", - "email": f"jdummy{i}@dummyhost.example.com", - "path": f"/some/path/to/package{i}", - "licenseFile": f"/some/path/to/package{i}/license" - } - subprocess_output = json.dumps(packages) - with mock.patch.object(subprocess_module, "check_output") as mock_check_output: - 
mock_check_output.return_value = subprocess_output - with printed_output() as printed: - assert JavascriptLicenseFramework.get_dependencies() == [ - {'framework': 'javascript', 'licenses': ['Apache-2.0'], 'name': 'package1'}, - {'framework': 'javascript', 'licenses': ['MIT'], 'name': 'package2'}, - {'framework': 'javascript', 'licenses': ['Apache-2.0', 'MIT'], 'name': 'package3'}, - {'framework': 'javascript', 'licenses': [], 'name': 'package4'}, - ] - assert printed.lines == [ - "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']" - ] - - # A special case for missing data... - mock_check_output.return_value = "{}\n\n" - with pytest.raises(Exception) as esc: - # When no package data is available, {} gets returned, and we need to complain this is odd. - JavascriptLicenseFramework.get_dependencies() - assert str(esc.value) == "No javascript license data was found." +@pytest.mark.parametrize('verbose', [False, True]) +def test_javascript_license_framework_get_licenses(verbose): + + with local_attrs(LicenseFramework, REWRITE_VERBOSE=verbose): + print() # start on a fresh line + packages = {} + for i, license in enumerate(['Apache-2.0', 'MIT', '(MIT OR Apache-2.0)', ''], start=1): + package = f'package{i}' + packages[f"package{i}"] = { + "licenses": license, + "repository": f"https://github.com/dummy/{package}", + "publisher": f"J Dummy{i}", + "email": f"jdummy{i}@dummyhost.example.com", + "path": f"/some/path/to/package{i}", + "licenseFile": f"/some/path/to/package{i}/license" + } + subprocess_output = json.dumps(packages) + with mock.patch.object(subprocess_module, "check_output") as mock_check_output: + mock_check_output.return_value = subprocess_output + with printed_output() as printed: + assert JavascriptLicenseFramework.get_dependencies() == [ + {'framework': 'javascript', 'licenses': ['Apache-2.0'], 'name': 'package1'}, + {'framework': 'javascript', 'licenses': ['MIT'], 'name': 'package2'}, + {'framework': 'javascript', 'licenses': ['Apache-2.0', 'MIT'], 
'name': 'package3'}, + {'framework': 'javascript', 'licenses': [], 'name': 'package4'}, + ] + expected_rewrite_description = "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']" + assert printed.lines == ([expected_rewrite_description] if verbose else []) + + # A special case for missing data... + mock_check_output.return_value = "{}\n\n" + with pytest.raises(Exception) as esc: + # When no package data is available, {} gets returned, and we need to complain this is odd. + JavascriptLicenseFramework.get_dependencies() + assert str(esc.value) == "No javascript license data was found." def test_python_license_framework_piplicenses_args(): From e7a2c3c7dcae15fbbb5b7fe744870cba063fe0c3 Mon Sep 17 00:00:00 2001 From: Douglas Rioux Date: Tue, 19 Sep 2023 16:07:28 -0400 Subject: [PATCH 12/24] Add KMS key tooling --- dcicutils/glacier_utils.py | 20 ++++++++++++++++---- test/test_glacier_utils.py | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/dcicutils/glacier_utils.py b/dcicutils/glacier_utils.py index bbcf77893..7609ab316 100644 --- a/dcicutils/glacier_utils.py +++ b/dcicutils/glacier_utils.py @@ -58,6 +58,10 @@ def __init__(self, env_name: str): self.env_key = self.key_manager.get_keydict_for_env(env_name) self.health_page = get_health_page(key=self.env_key, ff_env=env_name) + @property + def kms_key_id(self) -> str: + return self.health_page.get("s3_encrypt_key_id", "") + @classmethod def is_glacier_storage_class(cls, storage_class: S3StorageClass): return storage_class in S3_GLACIER_CLASSES @@ -295,6 +299,9 @@ def _do_multipart_upload(self, bucket: str, key: str, total_size: int, part_size } if tags: cmu['Tagging'] = tags + if self.kms_key_id: + cmu['ServerSideEncryption'] = 'aws:kms' + cmu['SSEKMSKeyId'] = self.kms_key_id mpu = self.s3.create_multipart_upload(**cmu) mpu_upload_id = mpu['UploadId'] except Exception as e: @@ -381,16 +388,21 @@ def copy_object_back_to_original_location(self, bucket: str, key: str, storage_c else: # Force copy 
the object into standard in a single operation copy_source = {'Bucket': bucket, 'Key': key} - copy_target = { + copy_args = { 'Bucket': bucket, 'Key': key, 'StorageClass': storage_class, } if version_id: copy_source['VersionId'] = version_id - copy_target['CopySourceVersionId'] = version_id + copy_args['CopySourceVersionId'] = version_id if tags: - copy_target['Tagging'] = tags - response = self.s3.copy_object(CopySource=copy_source, **copy_target) + copy_args['Tagging'] = tags + if self.kms_key_id: + copy_args['ServerSideEncryption'] = 'aws:kms' + copy_args['SSEKMSKeyId'] = self.kms_key_id + response = self.s3.copy_object( + **copy_args, CopySource=copy_source + ) PRINT(f'Response from boto3 copy:\n{response}') PRINT(f'Object {bucket}/{key} copied back to its original location in S3') return response diff --git a/test/test_glacier_utils.py b/test/test_glacier_utils.py index e69bbc3cb..b44c50cd6 100644 --- a/test/test_glacier_utils.py +++ b/test/test_glacier_utils.py @@ -27,6 +27,7 @@ def mock_health_page() -> dict: 'file_upload_bucket': 'cgap-dummy-main-application-cgap-dummy-files', 'namespace': 'cgap-dummy', 'processed_file_bucket': 'cgap-dummy-main-application-cgap-dummy-wfoutput', + 's3_encrypt_key_id': 'dummy_kms_key', } From 9352f4fea03dbe7f8226b4cdbbc8985b4bbddf35 Mon Sep 17 00:00:00 2001 From: Douglas Rioux Date: Tue, 19 Sep 2023 16:08:24 -0400 Subject: [PATCH 13/24] Ignore Vi temp files --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 141fe1474..0731c2579 100644 --- a/.gitignore +++ b/.gitignore @@ -121,3 +121,7 @@ ENV/ # PyCharm metadata .idea/ + +# Vi +*.swp +*.swo From c3c4efdf60cc81c5102b9868f72f001ca91558db Mon Sep 17 00:00:00 2001 From: Douglas Rioux Date: Tue, 19 Sep 2023 16:10:26 -0400 Subject: [PATCH 14/24] Bump minor version + changelog --- CHANGELOG.rst | 9 +++++++++ pyproject.toml | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 
345c0a1d1..3031f9d23 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,14 @@ Change Log ---------- +7.12.0 +====== + +* In ``glacier_utils``: + + * Add functionality for KMS key encrypted accounts + + 7.11.0 ====== @@ -16,6 +24,7 @@ Change Log * Fix in ``get_schema`` and ``get_schemas`` for the ``portal_vapp`` returning webtest.response.TestResponse which has a ``json`` object property rather than a function. + 7.10.0 ====== diff --git a/pyproject.toml b/pyproject.toml index 65dba0353..10bf6a9ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.11.0" +version = "7.12.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 7e8e1f7dfe081d204cfeebea756bd43933ef38a4 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 20 Sep 2023 12:56:34 -0400 Subject: [PATCH 15/24] Refactor for better control and to fix unit tests. --- dcicutils/license_utils.py | 111 ++++++++++++++++++++----------------- test/test_license_utils.py | 90 +++++++++++++++++++++++++++--- 2 files changed, 140 insertions(+), 61 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 14dd74ca1..d37e9a3f9 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -33,6 +33,7 @@ from dcicutils.lang_utils import there_are from dcicutils.misc_utils import ( PRINT, get_error_message, ignorable, ignored, json_file_contents, local_attrs, environ_bool, + remove_suffix, ) T = TypeVar("T") @@ -65,13 +66,16 @@ class LicenseStatus: UNEXPECTED_MISSING = "UNEXPECTED_MISSING" -LICENSE_UTILS_VERBOSE = environ_bool("LICENSE_UTILS_VERBOSE", default=False) +class LicenseOptions: + # General verbosity, such as progress information + VERBOSE = environ_bool("LICENSE_UTILS_VERBOSE", default=True) + # Specific additional debugging output + DEBUG = environ_bool("LICENSE_UTILS_DEBUG", default=False) class LicenseFramework: 
NAME = None - VERBOSE = LICENSE_UTILS_VERBOSE @classmethod def get_dependencies(cls): @@ -135,29 +139,6 @@ def all_frameworks(cls): return sorted(cls.LICENSE_FRAMEWORKS.values(), key=lambda x: x.NAME) -def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> List[str]: - # We only care which licenses were mentioned, not what algebra is used on them. - # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) - # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". - # If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will - # have trouble passing unless both MIT and GPL-3.0 are allowed. - terms = sorted(map(lambda x: x.strip(), - (boolean_expression - .replace('(', '') - .replace(')', '') - .replace(' AND ', ',') - .replace(' and ', ',') - .replace(' & ', ',') - .replace(' OR ', ',') - .replace(' or ', ',') - .replace('|', ',') - .replace(';', ',') - .replace(' + ', ',') - .replace('file ', f'Custom: {for_package_name} file ') - ).split(','))) - return terms - - # This is intended to match ' (= 3)', ' (>= 3)', ' (version 3)', ' (version 3 or greater)' # It will incidentally and harmlessly also take ' (>version 3)' or '(>= 3 or greater)'. 
# It will also correctly handle the unlikely case of ' (= 3 or greater)' @@ -168,7 +149,7 @@ def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> Lis _GPL_VERSION_CHOICE = re.compile('^GPL-v?([0-9.+]) (?:OR|[|]) GPL-v?([0-9.+])$') -def simplify_license_versions(licenses_spec: str, *, for_package_name, verbose: bool = False) -> str: +def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: m = _GPL_VERSION_CHOICE.match(licenses_spec) if m: version_a, version_b = m.groups() @@ -207,18 +188,58 @@ def simplify_license_versions(licenses_spec: str, *, for_package_name, verbose: break matched = m.group(1) licenses_spec = licenses_spec.replace(matched, '+') - if verbose and licenses_spec != original_licenses_spec: - print(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") + if LicenseOptions.DEBUG and licenses_spec != original_licenses_spec: + PRINT(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") return licenses_spec +def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> List[str]: + # We only care which licenses were mentioned, not what algebra is used on them. + # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) + # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". + # If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will + # have trouble passing unless both MIT and GPL-3.0 are allowed. 
+ revised_boolean_expression = ( + boolean_expression + .replace('(', '') + .replace(')', '') + .replace(' AND ', ',') + .replace(' and ', ',') + .replace(' & ', ',') + .replace(' OR ', ',') + .replace(' or ', ',') + .replace('|', ',') + .replace(';', ',') + .replace(' + ', ',') + .replace('file ', f'Custom: {for_package_name} file ') + ) + terms = [x for x in sorted(map(lambda x: x.strip(), revised_boolean_expression.split(','))) if x] + if LicenseOptions.DEBUG and revised_boolean_expression != boolean_expression: + PRINT(f"Rewriting {boolean_expression!r} as {terms!r}.") + return terms + + @LicenseFrameworkRegistry.register_framework(name='javascript') class JavascriptLicenseFramework(LicenseFramework): @classmethod def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: ignored(package_name) - return extract_boolean_terms(licenses_spec, for_package_name=package_name) + licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name) + licenses = extract_boolean_terms(licenses_spec, for_package_name=package_name) + return licenses + + VERSION_PATTERN = re.compile('^.+?([@][0-9.][^@]*|)$') + + @classmethod + def strip_version(cls, raw_name): + name = raw_name + m = cls.VERSION_PATTERN.match(raw_name) # e.g., @foo/bar@3.7 + if m: + suffix = m.group(1) + if suffix: + name = remove_suffix(m.group(1), name) + return name @classmethod def get_dependencies(cls): @@ -231,18 +252,13 @@ def get_dependencies(cls): # e.g., this happens if there's no javascript in the repo raise Exception("No javascript license data was found.") result = [] - for name, record in records.items(): - licenses_spec = record.get(_LICENSES) - if '(' in licenses_spec: - licenses = cls.implicated_licenses(package_name=name, licenses_spec=licenses_spec) - # print(f"Rewriting {licenses_spec!r} as {licenses!r}") - elif licenses_spec: - licenses = [licenses_spec] - else: - licenses = [] + for raw_name, record in records.items(): + name = 
cls.strip_version(raw_name) + raw_licenses_spec = record.get(_LICENSES) + licenses = cls.implicated_licenses(licenses_spec=raw_licenses_spec, package_name=name) entry = { - _NAME: name.lstrip('@').split('@')[0], # e.g., @foo/bar@3.7 - _LICENSES: licenses, # TODO: could parse this better. + _NAME: name, + _LICENSES: licenses, _FRAMEWORK: 'javascript' } result.append(entry) @@ -322,11 +338,6 @@ def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: return [cls.R_LANGUAGE_LICENSE_NAME] licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name) licenses = extract_boolean_terms(licenses_spec, for_package_name=package_name) - # licenses = sorted(map(lambda x: x.strip(), - # (licenses_spec - # .replace('|', ',') - # .replace('file ', f'Custom: {package_name} file ') - # ).split(','))) return licenses @classmethod @@ -367,7 +378,7 @@ def get_dependencies(cls): result.append(entry) except Exception as e: found_problems += 1 - if cls.VERBOSE: + if LicenseOptions.VERBOSE: PRINT(get_error_message(e)) if found_problems > 0: warnings.warn(there_are(found_problems, kind="problem", show=False, punctuate=True, tense='past')) @@ -376,8 +387,6 @@ def get_dependencies(cls): class LicenseFileParser: - VERBOSE = LICENSE_UTILS_VERBOSE - SEPARATORS = '-.,' SEPARATORS_AND_WHITESPACE = SEPARATORS + ' \t' COPYRIGHT_SYMBOL = '\u00a9' @@ -414,7 +423,7 @@ def parse_simple_license_file(cls, *, filename): lines = [] for i, line in enumerate(fp): line = line.strip(' \t\n\r') - if cls.VERBOSE: # pragma: no cover - this is just for debugging + if LicenseOptions.DEBUG: # pragma: no cover - this is just for debugging PRINT(str(i).rjust(3), line) m = cls.COPYRIGHT_LINE.match(line) if line[:1].isupper() else None if not m: @@ -506,8 +515,6 @@ class LicenseChecker: # Set this to True in subclasses if you want your organization's policy to be that you see # some visible proof of which licenses were checked. 
- VERBOSE = True - LICENSE_TITLE = None COPYRIGHT_OWNER = None LICENSE_FRAMEWORKS = None @@ -630,7 +637,7 @@ def analyze_license_dependencies_for_framework(cls, *, _LICENSES: license_names, _STATUS: status }) - if cls.VERBOSE: # pragma: no cover - this is just for debugging + if LicenseOptions.VERBOSE: # pragma: no cover - this is just for debugging PRINT(f"Checked {framework.NAME} {name}:" f" {'; '.join(license_names) if license_names else '---'} ({status})") diff --git a/test/test_license_utils.py b/test/test_license_utils.py index 01d898eff..bb87ce8d8 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -8,11 +8,12 @@ from collections import defaultdict from dcicutils.license_utils import ( - LicenseFrameworkRegistry, LicenseFramework, + LicenseOptions, LicenseFrameworkRegistry, LicenseFramework, PythonLicenseFramework, JavascriptLicenseFramework, CondaLicenseFramework, RLicenseFramework, LicenseAnalysis, LicenseChecker, LicenseStatus, LicenseFileParser, LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure, warnings as license_utils_warnings_module, + extract_boolean_terms, simplify_license_versions, ) from dcicutils.misc_utils import ignored, file_contents, local_attrs from dcicutils.qa_utils import printed_output, MockFileSystem @@ -187,11 +188,82 @@ class DummyLicenseFramework1(LicenseFramework): LicenseFrameworkRegistry.find_framework(1) # noQA - arg is intentionally of wrong type for testing -def test_javascript_license_framework_implicated_licenses(): +def test_javascript_license_framework_strip_version(): + + print() # start on fresh line + + strip_version = JavascriptLicenseFramework.strip_version + + assert strip_version('') == '' + + assert strip_version('foo') == 'foo' + assert strip_version('foo@bar') == 'foo@bar' + + assert strip_version('foo@3') == 'foo' + assert strip_version('foo@3.1') == 'foo' + assert strip_version('foo@3.1.0') == 'foo' + assert strip_version('foo@3.1.0b3') == 'foo' + assert 
strip_version('foo@3.1-beta') == 'foo' + + assert strip_version("@foo-3.1-beta") == '@foo-3.1-beta' # we don't treat leading '@' as a version marker + assert strip_version('foo@.9') == 'foo' # we tolerate a leading dot even though it's probably bad form + assert strip_version('foo@beta-3.9') == 'foo@beta-3.9' # treating suffix as version here is farther than we'll go + + +@pytest.mark.parametrize('debug', [False, True]) +def test_simplify_license_versions(debug): + + def test_it(spec, expected): + with local_attrs(LicenseOptions, DEBUG=True): + with printed_output() as printed: + assert simplify_license_versions(spec, for_package_name='ignored') == expected + assert printed.last == f"Rewriting {spec!r} as {expected!r}." + + test_it('GPL (version 2)', 'GPL-2') + test_it('GPL (version 2.0)', 'GPL-2.0') + test_it('GPL (= 2.0)', 'GPL-2.0') + test_it('GPL (= 2.1)', 'GPL-2.1') + + test_it('GPL (>= 2)', 'GPL-2+') + test_it('GPL (>= 2.0)', 'GPL-2.0+') + test_it('GPL (version 2 or greater)', 'GPL-2+') + test_it('GPL (version 2 or later)', 'GPL-2+') + + +@pytest.mark.parametrize('debug', [False, True]) +def test_extract_boolean_terms(debug): + + print() # start on a blank line def check_implications(spec, implications): - assert JavascriptLicenseFramework.implicated_licenses(package_name='ignored', - licenses_spec=spec) == implications + with local_attrs(LicenseOptions, DEBUG=debug): + with printed_output() as printed: + assert extract_boolean_terms(spec, for_package_name='ignored') == implications + assert printed.lines == ([f"Rewriting {spec!r} as {implications!r}."] if debug else []) + + check_implications(spec='(MIT AND BSD-3-Clause)', implications=['BSD-3-Clause', 'MIT']) + check_implications(spec='(CC-BY-4.0 AND OFL-1.1 AND MIT)', implications=['CC-BY-4.0', 'MIT', 'OFL-1.1']) + + check_implications(spec='(MIT OR Apache-2.0)', implications=['Apache-2.0', 'MIT']) + + check_implications(spec='(FOO OR (BAR AND BAZ))', implications=['BAR', 'BAZ', 'FOO']) + + sample_package 
= 'some-package' + assert extract_boolean_terms('MIT or file FOO', for_package_name=sample_package) == [ + f'Custom: {sample_package} file FOO', + 'MIT', + ] + + +@pytest.mark.parametrize('debug', [False, True]) +def test_javascript_license_framework_implicated_licenses(debug): + + def check_implications(spec, implications): + with local_attrs(LicenseOptions, DEBUG=debug): + with printed_output() as printed: + assert JavascriptLicenseFramework.implicated_licenses(package_name='ignored', + licenses_spec=spec) == implications + assert printed.lines == ([f"Rewriting {spec!r} as {implications!r}."] if debug else []) check_implications(spec='(MIT AND BSD-3-Clause)', implications=['BSD-3-Clause', 'MIT']) check_implications(spec='(CC-BY-4.0 AND OFL-1.1 AND MIT)', implications=['CC-BY-4.0', 'MIT', 'OFL-1.1']) @@ -201,10 +273,10 @@ def check_implications(spec, implications): check_implications(spec='(FOO OR (BAR AND BAZ))', implications=['BAR', 'BAZ', 'FOO']) -@pytest.mark.parametrize('verbose', [False, True]) -def test_javascript_license_framework_get_licenses(verbose): +@pytest.mark.parametrize('debug', [False, True]) +def test_javascript_license_framework_get_licenses(debug): - with local_attrs(LicenseFramework, REWRITE_VERBOSE=verbose): + with local_attrs(LicenseOptions, DEBUG=debug): print() # start on a fresh line packages = {} for i, license in enumerate(['Apache-2.0', 'MIT', '(MIT OR Apache-2.0)', ''], start=1): @@ -227,8 +299,8 @@ def test_javascript_license_framework_get_licenses(verbose): {'framework': 'javascript', 'licenses': ['Apache-2.0', 'MIT'], 'name': 'package3'}, {'framework': 'javascript', 'licenses': [], 'name': 'package4'}, ] - expected_rewrite_description = "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']" - assert printed.lines == ([expected_rewrite_description] if verbose else []) + expected_rewrite_description = "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']." 
+ assert printed.lines == ([expected_rewrite_description] if debug else []) # A special case for missing data... mock_check_output.return_value = "{}\n\n" From 637dff67623c903588505de63e817079c2e0428a Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 20 Sep 2023 14:00:05 -0400 Subject: [PATCH 16/24] Allow --brief, --debug, and --conda-prefix to be specified on command line of run-license-checker. --- dcicutils/license_utils.py | 16 +++++++++++++--- dcicutils/scripts/run_license_checker.py | 19 +++++++++++++++---- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index d37e9a3f9..b14ffa82e 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -71,6 +71,16 @@ class LicenseOptions: VERBOSE = environ_bool("LICENSE_UTILS_VERBOSE", default=True) # Specific additional debugging output DEBUG = environ_bool("LICENSE_UTILS_DEBUG", default=False) + CONDA_PREFIX = os.environ.get("CONDA_LICENSE_CHECKER_PREFIX", os.environ.get("CONDA_PREFIX", "")) + + @classmethod + @contextlib.contextmanager + def selected_options(cls, verbose=VERBOSE, debug=DEBUG, conda_prefix=CONDA_PREFIX): + """ + Allows a script, for example, to specify overrides for these options dynamically. 
+ """ + with local_attrs(cls, VERBOSE=verbose, DEBUG=debug, CONDA_PREFIX=conda_prefix): + yield class LicenseFramework: @@ -296,7 +306,7 @@ class CondaLicenseFramework(LicenseFramework): @classmethod def get_dependencies(cls): - prefix = os.environ.get("CONDA_LICENSE_CHECKER_PREFIX", os.environ.get("CONDA_PREFIX", "")) + prefix = LicenseOptions.CONDA_PREFIX result = [] filespec = os.path.join(prefix, "conda-meta/*.json") files = glob.glob(filespec) @@ -423,8 +433,8 @@ def parse_simple_license_file(cls, *, filename): lines = [] for i, line in enumerate(fp): line = line.strip(' \t\n\r') - if LicenseOptions.DEBUG: # pragma: no cover - this is just for debugging - PRINT(str(i).rjust(3), line) + # if LicenseOptions.DEBUG: # pragma: no cover - this is just for debugging + # PRINT(str(i).rjust(3), line) m = cls.COPYRIGHT_LINE.match(line) if line[:1].isupper() else None if not m: lines.append(line) diff --git a/dcicutils/scripts/run_license_checker.py b/dcicutils/scripts/run_license_checker.py index 33ff4b90c..f2324c0cf 100644 --- a/dcicutils/scripts/run_license_checker.py +++ b/dcicutils/scripts/run_license_checker.py @@ -2,7 +2,7 @@ from dcicutils.command_utils import script_catch_errors, ScriptFailure from dcicutils.lang_utils import there_are, conjoined_list -from dcicutils.license_utils import LicenseCheckerRegistry, LicenseChecker, LicenseCheckFailure +from dcicutils.license_utils import LicenseOptions, LicenseCheckerRegistry, LicenseChecker, LicenseCheckFailure from dcicutils.misc_utils import PRINT, get_error_message from typing import Optional, Type @@ -25,10 +25,17 @@ def main(): help=f"The name of a checker to run. 
" + there_are(ALL_CHECKER_NAMES, kind='available checker', show=True, joiner=conjoined_list, punctuate=True)) + parser.add_argument("--brief", '-b', default=False, action="store_true", + help="Requests brief output.") + parser.add_argument("--debug", '-q', default=False, action="store_true", + help="Requests additional debugging output.") + parser.add_argument("--conda-prefix", "--conda_prefix", "--cp", default=LicenseOptions.CONDA_PREFIX, + help=(f"Overrides the CONDA_PREFIX (default {LicenseOptions.CONDA_PREFIX!r}).")) + args = parser.parse_args() with script_catch_errors(): - run_license_checker(name=args.name) + run_license_checker(name=args.name, verbose=not args.brief, debug=args.debug, conda_prefix=args.conda_prefix) def show_help_for_choosing_license_checker(): @@ -52,7 +59,10 @@ def show_help_for_choosing_license_checker(): PRINT("") -def run_license_checker(name: Optional[str]): +def run_license_checker(name: Optional[str], + verbose=LicenseOptions.VERBOSE, + debug=LicenseOptions.DEBUG, + conda_prefix=LicenseOptions.CONDA_PREFIX): if name is None: show_help_for_choosing_license_checker() else: @@ -61,6 +71,7 @@ def run_license_checker(name: Optional[str]): except Exception as e: raise ScriptFailure(str(e)) try: - checker_class.validate() + with LicenseOptions.selected_options(verbose=verbose, debug=debug, conda_prefix=conda_prefix): + checker_class.validate() except LicenseCheckFailure as e: raise ScriptFailure(get_error_message(e)) From a5674b8c46c1f82ef7a7b4b5d9d14070b8181a83 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Wed, 20 Sep 2023 14:11:28 -0400 Subject: [PATCH 17/24] Remove some debugging code. 
--- dcicutils/license_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index b14ffa82e..f0717b50e 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -433,8 +433,6 @@ def parse_simple_license_file(cls, *, filename): lines = [] for i, line in enumerate(fp): line = line.strip(' \t\n\r') - # if LicenseOptions.DEBUG: # pragma: no cover - this is just for debugging - # PRINT(str(i).rjust(3), line) m = cls.COPYRIGHT_LINE.match(line) if line[:1].isupper() else None if not m: lines.append(line) From e6a6514bb427108788f07d4e2aac218af4de9016 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 29 Sep 2023 08:01:08 -0400 Subject: [PATCH 18/24] Refactor to make things data-driven. --- dcicutils/common.py | 3 + .../license_policies/c4-infrastructure.jsonc | 8 + .../c4-python-infrastructure.jsonc | 8 + .../park-lab-common-server.jsonc | 104 +++ .../license_policies/park-lab-common.jsonc | 407 ++++++++ .../park-lab-gpl-pipeline.jsonc | 62 ++ .../license_policies/park-lab-pipeline.jsonc | 12 + dcicutils/license_utils.py | 865 +++++------------- dcicutils/misc_utils.py | 6 +- dcicutils/scripts/run_license_checker.py | 3 +- poetry.lock | 14 +- pyproject.toml | 1 + test/test_license_utils.py | 237 ++++- test/test_misc_utils.py | 3 +- 14 files changed, 1090 insertions(+), 643 deletions(-) create mode 100644 dcicutils/license_policies/c4-infrastructure.jsonc create mode 100644 dcicutils/license_policies/c4-python-infrastructure.jsonc create mode 100644 dcicutils/license_policies/park-lab-common-server.jsonc create mode 100644 dcicutils/license_policies/park-lab-common.jsonc create mode 100644 dcicutils/license_policies/park-lab-gpl-pipeline.jsonc create mode 100644 dcicutils/license_policies/park-lab-pipeline.jsonc diff --git a/dcicutils/common.py b/dcicutils/common.py index b4f487cf3..13d518455 100644 --- a/dcicutils/common.py +++ b/dcicutils/common.py @@ -1,4 +1,5 @@ import os +import re from 
typing import Dict, Union, Tuple, List, Any from typing_extensions import Literal @@ -36,6 +37,8 @@ LIBRARY_DIR = os.path.dirname(__file__) +Regexp = type(re.compile("sample")) + # ===== Auth Data ===== AuthStr = str diff --git a/dcicutils/license_policies/c4-infrastructure.jsonc b/dcicutils/license_policies/c4-infrastructure.jsonc new file mode 100644 index 000000000..7a77448f6 --- /dev/null +++ b/dcicutils/license_policies/c4-infrastructure.jsonc @@ -0,0 +1,8 @@ +{ + "class_key": "c4-infrastructure", + "class_name": "C4InfrastructureLicenseChecker", + "inherits_from": ["park-lab-common-server"], + "description": "Checker for C4 infrastructure (Fourfront, CGAP, SMaHT) from Park Lab.", + + "LICENSE_TITLE": "(The )?MIT License" +} diff --git a/dcicutils/license_policies/c4-python-infrastructure.jsonc b/dcicutils/license_policies/c4-python-infrastructure.jsonc new file mode 100644 index 000000000..12a4afcf2 --- /dev/null +++ b/dcicutils/license_policies/c4-python-infrastructure.jsonc @@ -0,0 +1,8 @@ +{ + "class_key": "c4-python-infrastructure", + "class_name": "C4PythonInfrastructureLicenseChecker", + "inherits_from": ["c4-infrastructure"], + "description": "Checker for C4 python library infrastructure (Fourfront, CGAP, SMaHT) from Park Lab.", + + "LICENSE_FRAMEWORKS": ["python"] +} diff --git a/dcicutils/license_policies/park-lab-common-server.jsonc b/dcicutils/license_policies/park-lab-common-server.jsonc new file mode 100644 index 000000000..72c1af930 --- /dev/null +++ b/dcicutils/license_policies/park-lab-common-server.jsonc @@ -0,0 +1,104 @@ +{ + "class_key": "park-lab-common-server", + "inherits_from": ["park-lab-common"], + "description": "Minimal/generic checker for servers from Park Lab.", + + "LICENSE_FRAMEWORKS": ["python", "javascript"], + + "EXCEPTIONS": { + "BSD*": [ + // Although modified to insert the author name into the license text itself, + // the license for these libraries are essentially BSD-3-Clause. 
+ "formatio", + "samsam", + + // There are some slightly different versions of what appear to be BSD licenses here, + // but clearly the license is permissive. + // Ref: https://www.npmjs.com/package/mutation-observer?activeTab=readme + "mutation-observer" + ], + "Custom: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global": [ + // The use of this URL appears to be a syntax error in the definition of entries-ponyfill + // In fact this seems to be covered by a CC0-1.0 license. + // Ref: https://unpkg.com/browse/object.entries-ponyfill@1.0.1/LICENSE + "object.entries-ponyfill" + ], + "Custom: https://github.com/saikocat/colorbrewer.": [ + // The use of this URL appears to be a syntax error in the definition of cartocolor + // In fact, this seems to be covered by a CC-BY-3.0 license. + // Ref: https://www.npmjs.com/package/cartocolor?activeTab=readme + "cartocolor" + ], + "Custom: https://travis-ci.org/component/emitter.png": [ + // The use of this png appears to be a syntax error in the definition of emitter-component. + // In fact, emitter-component uses an MIT License + // Ref: https://www.npmjs.com/package/emitter-component + // Ref: https://github.com/component/emitter/blob/master/LICENSE + "emitter-component" + ], + "Custom: https://travis-ci.org/DenisCarriere/turf-jsts.svg": [ + // The 'turfs-jsts' repository (https://github.com/DenisCarriere/turf-jsts/blob/master/README.md) + // seems to lack a license, but appears to be forked from the jsts library that uses + // the Eclipse Public License 1.0 and Eclipse Distribution License 1.0, so probably a permissive + // license is intended. 
+ "turf-jsts" + ], + "GNU General Public License (GPL)": [ + "docutils" // Used only privately as a separate documentation-generation task for ReadTheDocs + ], + "GNU Library or Lesser General Public License (LGPL)": [ + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + // "GNU Lesser General Public License v3 or later (LGPLv3+)", + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "psycopg2", // Used at runtime during server operation, but not modified or distributed + "psycopg2-binary", // Used at runtime during server operation, but not modified or distributed + "chardet", // Potentially used downstream in loadxl to detect charset for text files + "pyzmq" // Used in post-deploy-perf-tests, not distributed, and not modified or distributed + ], + "GPL-2.0": [ + // The license file for the node-forge javascript library says: + // + // "You may use the Forge project under the terms of either the BSD License or the + // GNU General Public License (GPL) Version 2." + // + // (We choose to use it under the BSD license.) + // Ref: https://www.npmjs.com/package/node-forge?activeTab=code + "node-forge" + ], + "MIT*": [ + // This library uses a mix of licenses, but they (MIT, CC0) generally seem permissive. + // (It also mentions that some tools for building/testing use other libraries.) + // Ref: https://github.com/requirejs/domReady/blob/master/LICENSE + "domready", + + // This library is under "COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1" + // Ref: https://github.com/javaee/jsonp/blob/master/LICENSE.txt + // About CDDL ... + // Linking = Permissive, Private Use = ? + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "jsonp", + + // This library says pretty clearly it intends MIT license. 
+ // Ref: https://www.npmjs.com/package/component-indexof + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "component-indexof", + + // These look like a pretty straight MIT license. + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "mixin", // LICENSE file at https://www.npmjs.com/package/mixin?activeTab=code + "stack-trace", // https://github.com/stacktracejs/stacktrace.js/blob/master/LICENSE + "typed-function" // LICENSE at https://www.npmjs.com/package/typed-function?activeTab=code + ], + "UNLICENSED": [ + // The udn-browser library is our own and has been observed to sometimes show up in some contexts + // as UNLICENSED, when really it is MIT. + // Ref: https://github.com/dbmi-bgm/udn-browser/blob/main/LICENSE + "udn-browser" + ] + } +} diff --git a/dcicutils/license_policies/park-lab-common.jsonc b/dcicutils/license_policies/park-lab-common.jsonc new file mode 100644 index 000000000..e59d67aee --- /dev/null +++ b/dcicutils/license_policies/park-lab-common.jsonc @@ -0,0 +1,407 @@ +{ + "class_key": "park-lab-common", + "class_name": "ParkLabCommonLicenseChecker", + "inherits_from": [], + "description": "Minimal/generic checker common to all tech from Park Lab.", + + "COPYRIGHT_OWNER": "President and Fellows of Harvard College", + + "LICENSE_FRAMEWORKS": "ALL", + + "ALLOWED": [ + + // <> + // Ref: https://opensource.org/license/0bsd/ + "0BSD", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Academic Free License (AFL)", + "AFL-2.1", + + // Linking = Permissive, Private Use = Yes + // Apache licenses before version 2.0 are controversial, but we here construe an unmarked naming to imply + // any version, and hence v2. 
+ // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Apache Software License", + "Apache-Style", + {"pattern": "Apache([- ]2([.]0)?)?([- ]Licen[cs]e)?([- ]with[- ]LLVM[- ]exception)?"}, + // "Apache-2.0", + + // Artistic License 1.0 was confusing to people, so its status as permissive is in general uncertain, + // however the issue seems to revolve around point 8 (relating to whether or not perl is deliberately + // exposed). That isn't in play for our uses, so we don't flag it here. + // Artistic license 2.0 is a permissive license. + // Ref: https://en.wikipedia.org/wiki/Artistic_License + "Artistic-1.0-Perl", + {"pattern": "Artistic[- ]2([.]0)?"}, + + // According to Wikipedia, the Boost is considered permissive and BSD-like. + // Refs: + // * + // * https://en.wikipedia.org/wiki/Boost_(C%2B%2B_libraries)#License + {"pattern": "(BSL|Boost(([- ]Software)?[- ]License)?)([- ]1([.]0)?)?"}, + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + {"pattern": "((modified[- ])?[234][- ]Clause[- ])?BSD([- ][234][- ]Clause)?( Licen[cs]e)?"}, + // "BSD License", + // "BSD-2-Clause", + // "BSD-3-Clause", + // "BSD 3-Clause", + + // BZIP2 is a permissive license + // Ref: https://github.com/asimonov-im/bzip2/blob/master/LICENSE + {"pattern": "bzip2(-1[.0-9]*)"}, + + // Linking = Public Domain, Private Use = Public Domain + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "CC0", + "CC0-1.0", + + // Linking = Permissive, Private Use = Permissive + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "CC-BY", + "CC-BY-3.0", + "CC-BY-4.0", + + // The curl license is a permissive license. + // Ref: https://curl.se/docs/copyright.html + "curl", + + // Linking = Permissive, Private Use = ? 
+ // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "CDDL", + + // The original Eclipse Distribution License 1.0 is essentially a BSD-3-Clause license. + // Ref: https://www.eclipse.org/org/documents/edl-v10.php + "Eclipse Distribution License", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Eclipse Public License", + "EPL-2.0", + + // The FSF Unlimited License (FSFUL) seems to be a completely permissive license. + // Refs: + // * https://spdx.org/licenses/FSFUL.html + // * https://fedoraproject.org/wiki/Licensing/FSF_Unlimited_License + "FSF Unlimited License", + "FSFUL", + + // The FreeType license is a permissive license. + // Ref: LicenseRef-FreeType + {"pattern": "(Licen[cs]eRef-)?(FTL|FreeType( Licen[cs]e)?)"}, + + // Linking = Yes, Cat = Permissive Software Licenses + // Ref: https://en.wikipedia.org/wiki/Historical_Permission_Notice_and_Disclaimer + "Historical Permission Notice and Disclaimer (HPND)", + "HPND", + {"pattern": "(Licen[cs]eRef-)?PIL"}, + // The Pillow or Python Image Library is an HPND license, which is a simple permissive license: + // Refs: + // * https://github.com/python-pillow/Pillow/blob/main/LICENSE + // * https://www.fsf.org/blogs/licensing/historical-permission-notice-and-disclaimer-added-to-license-list + + // The IJG license, used by Independent JPEG Group (IJG) is a custom permissive license. 
+ // Refs: + // * https://en.wikipedia.org/wiki/Libjpeg + // * https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/LICENSE.md + "IJG", + + // Linking = Permissive, Private Use = Permissive + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "ISC License (ISCL)", + "ISC", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "MIT License", + "MIT", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Mozilla Public License 2.0 (MPL 2.0)", + "MPL-1.1", + "MPL-2.0", + + // The SIL Open Font License appears to be a copyleft-style license that applies narrowly + // to icons and not to the entire codebase. It is advertised as OK for use even in commercial + // applications. + // Ref: https://fontawesome.com/license/free + "OFL-1.1", + + // Ref: https://en.wikipedia.org/wiki/Public_domain + {"pattern": "(Licen[cs]eRef-)?Public[- ]Domain([- ]dedic[t]?ation)?"}, // "dedictation" is a typo in docutils + + // Linking = Permissive, Private Use = Permissive + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + {"pattern": "(Licen[cs]eRef-)?PSF-2([.][.0-9]*)"}, + "Python Software Foundation License", + "Python-2.0", + + // License = BSD-like + // Ref: https://en.wikipedia.org/wiki/Pylons_project + "Repoze Public License", + + // The TCL or Tcl/Tk licenses are permissive licenses. + // Ref: https://www.tcl.tk/software/tcltk/license.html + // The one used by the tktable library has a "bourbon" clause that doesn't add compliance requirements + // Ref: https://github.com/wjoye/tktable/blob/master/license.txt + {"pattern": "Tcl([/]tk)?"}, + + // The Ubuntu Font Licence is mostly permissive. It contains some restrictions if you are going to modify the + // fonts that require you to change the name to avoid confusion. 
But for our purposes, we're assuming that's + // not done, and so we're not flagging it. + {"pattern": "Ubuntu Font Licen[cs]e Version( 1([.]0)?)?"}, + + // Linking = Permissive/Public domain, Private Use = Permissive/Public domain + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "The Unlicense (Unlicense)", + "Unlicense", + + // Various licenses seem to call themselves or be summed up as unlimited. + // So far we know of none that are not highly permissive. + // * boot and KernSmooth are reported by R as being "Unlimited" + // Refs: + // * https://cran.r-project.org/web/packages/KernSmooth/index.html + // (https://github.com/cran/KernSmooth/blob/master/LICENCE.note) + // * https://cran.r-project.org/package=boot + // (https://github.com/cran/boot/blob/master/DESCRIPTION) + "Unlimited", + + // Linking = Permissive, Private Use = ? + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "W3C License", + "W3C-20150513", + + // Linking = Permissive/Public Domain, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "WTFPL", + + // Copyleft = No + // Ref: https://en.wikipedia.org/wiki/Zlib_License + // Linking = Permissive, Private Use = ? (for zlib/libpng license) + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Zlib", + + // Copyleft = No, FSF/OSI-approved: Yes + // Ref: https://en.wikipedia.org/wiki/Zope_Public_License + "Zope Public License" + ], + + "EXCEPTIONS": { + + // The Bioconductor zlibbioc license is a permissive license. 
+ // Ref: https://github.com/Bioconductor/zlibbioc/blob/devel/LICENSE + "Custom: bioconductor-zlibbioc file LICENSE": [ + "bioconductor-zlibbioc" + ], + + // The Bioconductor rsamtools license is an MIT license + // Ref: https://bioconductor.org/packages/release/bioc/licenses/Rsamtools/LICENSE + "Custom: bioconductor-rsamtools file LICENSE": [ + "bioconductor-rsamtools" + ], + + // DFSG = Debian Free Software Guidelines + // Ref: https://en.wikipedia.org/wiki/Debian_Free_Software_Guidelines + // Used as an apparent modifier to other licenses, to say they are approved per Debian. + // For example in this case, pytest-timeout has license: DFSG approved, MIT License, + // but is really just an MIT License that someone has checked is DFSG approved. + "DFSG approved": [ + "pytest-timeout" // MIT Licensed + ], + + "FOSS": [ + // The r-stringi library is a conda library that implements a stringi (pronounced "stringy") library for R. + // The Conda source feed is: https://github.com/conda-forge/r-stringi-feedstock + // This page explains that the home source is https://stringi.gagolewski.com/ but that's a doc page. + // The doc page says: + // > stringi’s source code is hosted on GitHub. + // > It is distributed under the open source BSD-3-clause license. + // The source code has a license that begins with a BSD-3-clause license and includes numerous others, + // but they all appear to be permissive. 
+ // Ref: https://github.com/gagolews/stringi/blob/master/LICENSE + "stringi", + "r-stringi" + ], + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "GNU Lesser General Public License v2 or later (LGPLv2+)": [ + "chardet" // used at runtime during server operation (ingestion), but not modified or distributed + ], + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "GNU Lesser General Public License v3 or later (LGPLv3+)": [ + // used only privately in testing, not used in server code, not modified, not distributed + "pytest-redis", + // required by pytest-redis (used only where it's used) + "mirakuru" + ], + + "GNU General Public License (GPL)": [ + "docutils" // Used only privately as a separate documentation-generation task for ReadTheDocs + ], + + "MIT/X11 Derivative": [ + // The license used by libxkbcommon is complicated and involves numerous included licenses, + // but all are permissive. + // Ref: https://github.com/xkbcommon/libxkbcommon/blob/master/LICENSE + "libxkbcommon" + ], + + "None": [ + // It's not obvious why Conda shows this license as "None". + // In fact, though, BSD 3-Clause "New" or "Revised" License + // Ref: https://github.com/AnacondaRecipes/_libgcc_mutex-feedstock/blob/master/LICENSE.txt + "_libgcc_mutex" + ], + + "PostgreSQL": [ + // The libpq library is actually licensed with a permissive BSD 3-Clause "New" or "Revised" License + // Ref: https://github.com/lpsmith/postgresql-libpq/blob/master/LICENSE + "libpq" + ], + + "UCSD": [ + // It isn't obvious why these show up with a UCSD license in Conda. 
+ // The actual sources say it should be a 2-clause BSD license: + // Refs: + // * https://github.com/AlexandrovLab/SigProfilerMatrixGenerator/blob/master/LICENSE + // * https://github.com/AlexandrovLab/SigProfilerPlotting/blob/master/LICENSE + "sigprofilermatrixgenerator", + "sigprofilerplotting" + ], + + "X11": [ + // The ncurses library has a VERY complicated history, BUT seems consistently permissive + // and the most recent version seems to be essentially the MIT license. + // Refs: + // * https://en.wikipedia.org/wiki/Ncurses#License + // * https://invisible-island.net/ncurses/ncurses-license.html + "ncurses" + ], + + "zlib-acknowledgement": [ + // It isn't clear why libpng shows up with this license name, but the license for libpng + // is a permissive license. + // Ref: https://github.com/glennrp/libpng/blob/libpng16/LICENSE + "libpng" + ] + }, + + "EXPECTED_MISSING_LICENSES": [ + + // This is a name we use for our C4 portals. And it isn't published. + // We inherited the name from the Stanford ENCODE group, which had an MIT-licensed repo we forked + "encoded", // cgap-portal, fourfront, and smaht-portal all call themselves this + + // We believe that since these next here are part of the Pylons project, they're covered under + // the same license as the other Pylons projects. We're seeking clarification. + "pyramid-translogger", + "subprocess-middleware", + + // This appears to be a BSD 2-Clause "Simplified" License, according to GitHub. + // PyPi also says it's a BSD license. + // Ref: https://github.com/paulc/dnslib/blob/master/LICENSE + "dnslib", + + // This says it wants an ISC License, which we already have approval for but just isn't showing up. + // Ref: https://github.com/rthalley/dnspython/blob/master/LICENSE + "dnspython", + + // This appears to be a mostly-MIT-style license. + // There are references to parts being in the public domain, though it's not obvious if that's meaningful. 
+ // It's probably sufficient for our purposes to treat this as a permissive license. + // Ref: https://github.com/tlsfuzzer/python-ecdsa/blob/master/LICENSE + "ecdsa", + + // This has an MIT license in its source repository + // Ref: https://github.com/xlwings/jsondiff/blob/master/LICENSE + "jsondiff", + + // This has an MIT license in its source repository + // Ref: https://github.com/pkerpedjiev/negspy/blob/master/LICENSE + "negspy", + + // This license statement is complicated, but seems adequately permissive. + // Ref: https://foss.heptapod.net/python-libs/passlib/-/blob/branch/stable/LICENSE + "passlib", + + // This seems to be a BSD-3-Clause license. + // Ref: https://github.com/protocolbuffers/protobuf/blob/main/LICENSE + // pypi agrees in the Meta section of protobuf's page, where it says "3-Clause BSD License" + // Ref: https://pypi.org/project/protobuf/ + "protobuf", + + // The WTFPL license is permissive. + // Ref: https://github.com/mk-fg/pretty-yaml/blob/master/COPYING + "pyaml", + + // This uses a BSD license + // Ref: https://github.com/eliben/pycparser/blob/master/LICENSE + "pycparser", + + // The source repo for pyDes says this is under an MIT license + // Ref: https://github.com/twhiteman/pyDes/blob/master/LICENSE.txt + // pypi, probably wrongly, thinks this is in the public domain (as of 2023-07-21) + // Ref: https://pypi.org/project/pyDes/ + "pyDes", + + // This uses an MIT license + // Ref: https://github.com/pysam-developers/pysam/blob/master/COPYING + "pysam", + + // The version of python-lambda that we forked calls itself this (and publishes at pypi under this name) + "python-lambda-4dn", + + // This is MIT-licensed: + // Ref: https://github.com/themiurgo/ratelim/blob/master/LICENSE + // pypi agrees + // Ref: https://pypi.org/project/ratelim/ + "ratelim", + + // This is a BSD-3-Clause-Modification license + // Ref: https://github.com/repoze/repoze.debug/blob/master/LICENSE.txt + "repoze.debug", + + // This is an Apache-2.0 license + // Ref: 
https://github.com/getsentry/responses/blob/master/LICENSE + "responses", + + // This seems to get flagged sometimes, but is not the pypi snovault library, it's what our dcicsnovault + // calls itself internally. In any case, it's under MIT license and OK. + // Ref: https://github.com/4dn-dcic/snovault/blob/master/LICENSE.txt + "snovault", + + // PyPi identifies the supervisor library license as "BSD-derived (http://www.repoze.org/LICENSE.txt)" + // Ref: https://pypi.org/project/supervisor/ + // In fact, though, the license is a bit more complicated, though apparently still permissive. + // Ref: https://github.com/Supervisor/supervisor/blob/main/LICENSES.txt + "supervisor", + + // This seems to be a BSD-3-Clause-Modification license. + // Ref: https://github.com/Pylons/translationstring/blob/master/LICENSE.txt + "translationstring", + + // This seems to be a BSD-3-Clause-Modification license. + // Ref: https://github.com/Pylons/venusian/blob/master/LICENSE.txt + "venusian", + + // PyPi identifies zope.deprecation as using the "Zope Public License (ZPL 2.1)" license. + // Ref: https://github.com/zopefoundation/Zope/blob/master/LICENSE.txt + "zope.deprecation" + + // Below are licenses last known to have licenses missing in pip-licenses and need to be investigated further. + // Note well that just because pip-licenses doesn't know the license doesn't mean the software has + // no license. It may just mean the library is poorly registered in pypi. Some licenses have to be + // found by looking at the library's documentation or source files. 
+ + // (all of these have been classified at this point) + ] +} diff --git a/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc b/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc new file mode 100644 index 000000000..1ff0b2723 --- /dev/null +++ b/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc @@ -0,0 +1,62 @@ +{ + "class_key": "park-lab-gpl-pipeline", + "class_name": "ParkLabGplPipelineLicenseChecker", + "inherits_from": ["park-lab-pipeline"], + "description": "Minimal/generic checker for GPL-approved pipelines from Park Lab.", + + "ALLOWED": [ + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + // The "exceptions", if present, indicate waivers to source delivery requirements. + // Ref: https://spdx.org/licenses/LGPL-3.0-linking-exception.html + {"pattern": "GNU Lesser General Public License v2( or later)?( [(]LGPL[v]?[23][+]?[)])?"}, + // "GNU Lesser General Public License v2 or later (LGPLv2+)", + // "GNU Lesser General Public License v3 or later (LGPLv3+)", + // "LGPLv2", "LGPL-v2", "LGPL-v2.0", "LGPL-2", "LGPL-2.0", + // "LGPLv2+", "LGPL-v2+", "LGPL-v2.0+", "LGPL-2+", "LGPL-2.0+", + // "LGPLv3", "LGPL-v3", "LGPL-v3.0", "LGPL-3", "LGPL-3.0", + // "LGPLv3+", "LGPL-v3+", "LGPL-v3.0+", "LGPL-3+", "LGPL-3.0+", + {"pattern": "LGPL[v-]?[.0-9]*([+]|-only)?([- ]with[- ]exceptions)?"}, + + // Uncertain whether this is LGPL 2 or 3, but in any case we think weak copyleft should be OK + // for pipeline or server use as long as we're not distributing sources. 
+ "LGPL", + "GNU Library or Lesser General Public License (LGPL)", + + // GPL + // * library exception operates like LGPL + // * classpath exception is a linking exception related to Oracle + // Refs: + // * https://www.gnu.org/licenses/old-licenses/gpl-1.0.en.html + // * https://spdx.org/licenses/GPL-2.0-with-GCC-exception.html + // * https://spdx.org/licenses/GPL-3.0-with-GCC-exception.html + { + "pattern": [ + "(GNU General Public License|GPL)[ ]?[v-]?[123]([.]0)?([+]|[- ]only)?", + "([- ]with[- ]GCC(([- ]runtime)?[- ]library)?[- ]exception([- ][.0-9]*)?)?", + "([- ]with[- ]Classpath[- ]exception([- ][.0-9]+)?)?" + ] + }, + + // Linking = "GPLv3 compatible only", Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "GPL-2-or-3", // we sometimes generate this token + // "GPLv2+", "GPL-v2+", "GPL-v2.0+", "GPL-2+", "GPL-2.0+", + // "GPLv3", "GPL-v3", "GPL-v3.0", "GPL-3", "GPL-3.0", + // "GPLv3+", "GPL-v3+", "GPL-v3.0+", "GPL-3+", "GPL-3.0+", + // "GPLv3-only", "GPL-3-only", "GPL-v3-only", "GPL-3.0-only", "GPL-v3.0-only", + + // Uncertain whether this is GPL 2 or 3, but we'll assume that means we can use either. + // And version 3 is our preferred interpretation. 
+ "GNU General Public License", + "GPL", + + // This is an arbitrary catch-all name we made up because the R language some things identify themselves + // as a specific part of the R language + // Ref: https://cran.r-project.org/doc/FAQ/R-FAQ.html#Legalese + // An important clarification to this is here: + // Ref: https://cran.r-project.org/doc/FAQ/R-FAQ.html#Can-I-use-R-for-commercial-purposes_003f + "R-language-license" + ] +} diff --git a/dcicutils/license_policies/park-lab-pipeline.jsonc b/dcicutils/license_policies/park-lab-pipeline.jsonc new file mode 100644 index 000000000..5fbcc6616 --- /dev/null +++ b/dcicutils/license_policies/park-lab-pipeline.jsonc @@ -0,0 +1,12 @@ +{ + "class_key": "park-lab-pipeline", + "class_name": "ParkLabPipelineLicenseChecker", + "inherits_from": ["park-lab-common"], + "description": "Minimal/generic checker for non-GPL-approved pipelines from Park Lab.", + + "LICENSE_FRAMEWORKS": ["python", "conda", "r"] +} + + + + diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index f0717b50e..93abc42be 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -4,7 +4,6 @@ import glob import io import json -# import logging import os import re import subprocess @@ -25,15 +24,17 @@ # import piplicenses from collections import defaultdict +from jsonc_parser.parser import JsoncParser from typing import Any, Dict, DefaultDict, List, Optional, Type, TypeVar, Union # For obscure reasons related to how this file is used for early prototyping, these must use absolute references # to modules, not relative references. Later when things are better installed, we can make refs relative again. 
+from dcicutils.common import Regexp, AnyJsonData from dcicutils.exceptions import InvalidParameterError -from dcicutils.lang_utils import there_are +from dcicutils.lang_utils import there_are, conjoined_list from dcicutils.misc_utils import ( PRINT, get_error_message, ignorable, ignored, json_file_contents, local_attrs, environ_bool, - remove_suffix, + remove_suffix, to_camel_case ) T = TypeVar("T") @@ -49,6 +50,10 @@ _NAME = 'name' _STATUS = 'status' +_INHERITS_FROM = 'inherits_from' +_ALLOWED = 'allowed' +_EXCEPT = 'except' + def pattern(x): return re.compile(x, re.IGNORECASE) @@ -148,6 +153,10 @@ def find_framework(cls, framework_spec: FrameworkSpec): def all_frameworks(cls): return sorted(cls.LICENSE_FRAMEWORKS.values(), key=lambda x: x.NAME) + @classmethod + def all_framework_names(cls): + return sorted(cls.LICENSE_FRAMEWORKS.keys()) + # This is intended to match ' (= 3)', ' (>= 3)', ' (version 3)', ' (version 3 or greater)' # It will incidentally and harmlessly also take ' (>version 3)' or '(>= 3 or greater)'. @@ -315,13 +324,10 @@ def get_dependencies(cls): package_name = data['name'] package_license = data.get('license') or "MISSING" if package_license: - # print(f"package_license={package_license}") simplified_package_license_spec = simplify_license_versions(package_license, for_package_name=package_name) - # print(f" =simplified_package_license_spec => {simplified_package_license_spec}") package_licenses = extract_boolean_terms(simplified_package_license_spec, for_package_name=package_name) - # print(f"=> {package_licenses}") else: package_licenses = [] entry = { @@ -331,8 +337,6 @@ def get_dependencies(cls): } result.append(entry) result.sort(key=lambda x: x['name']) - # print(f"conda get_dependencies result={json.dumps(result, indent=2)}") - # print("conda deps = ", json.dumps(result, indent=2)) return result @@ -498,16 +502,16 @@ class LicenseChecker: COPYRIGHT_OWNER is the name of the copyright owner. 
- FRAMEWORKS will default to all defined frameworks (presently ['python', 'javascript'], but can be limited to + LICENSE_FRAMEWORKS will default to all defined frameworks (presently ['python', 'javascript'], but can be limited to just ['python'] for example. It doesn't make a lot of sense to limit it to ['javascript'], though you could, since you are using a Python library to do this, and it probably needs to have its dependencies checked. ALLOWED is a list of license names as returned by the pip-licenses library. - EXPECTED_MISSING is a list of libraries that are expected to have no license information. This is so you don't - have to get warning fatigue by seeing a warning over and over for things you know about. If a new library - with no license info shows up that you don't expect, you should investigate it, make sure it's OK, - and then add it to this list. + EXPECTED_MISSING_LICENSES is a list of libraries that are expected to have no license information. + This is so you don't have to get warning fatigue by seeing a warning over and over for things you know about. + If a new library with no license info shows up that you don't expect, you should investigate it, + make sure it's OK, and then add it to this list. EXCEPTIONS is a table (a dict) keyed on license names with entries that are lists of library names that are allowed to use the indicated license even though the license might not be generally allowed. This should be @@ -673,6 +677,7 @@ def analyze_license_dependencies_by_framework(cls, *, @classmethod def show_unacceptable_licenses(cls, *, analysis: LicenseAnalysis) -> LicenseAnalysis: if analysis.unacceptable: + # This is part of the essential output, so is not conditional on switches. 
PRINT(there_are(analysis.unacceptable, kind="unacceptable license", show=False, punctuation_mark=':')) for license, names in sorted(analysis.unacceptable.items()): PRINT(f" {license}: {', '.join(names)}") @@ -726,10 +731,14 @@ def _register(license_checker_class: Type[LicenseChecker]): return _register @classmethod - def lookup_checker(cls, name: str) -> Type[LicenseChecker]: - result: Optional[Type[LicenseChecker]] = cls.REGISTRY.get(name) + def find_checker(cls, checker_name: str) -> Optional[Type[LicenseChecker]]: + return cls.REGISTRY.get(checker_name, None) + + @classmethod + def lookup_checker(cls, checker_name: str) -> Type[LicenseChecker]: + result: Optional[Type[LicenseChecker]] = cls.find_checker(checker_name) if result is None: - raise InvalidParameterError(parameter='checker_name', value=name, + raise InvalidParameterError(parameter='checker_name', value=checker_name, options=cls.all_checker_names()) return result @@ -762,648 +771,230 @@ def __init__(self, message=None, unacceptable_licenses=None): super().__init__(message=message) -@LicenseCheckerRegistry.register_checker('park-lab-common') -class ParkLabCommonLicenseChecker(LicenseChecker): +def literal_string_or_regexp_from_dict(item): """ - Minimal checker common to all tech from Park Lab. + Expects either a string (which will be matched using ordinary equality) or a regular expression, + expressed as a dictionary of the form {"pattern": <regexp>, "flags": [<flag>, ...]} + The pattern is required. The flags may be omitted if null. + A pattern is either a string or a list of strings. If it is a list of strings, it will be concatenated + into a single string, which can be useful for breaking long strings over lines. + Flags are string names of re.WHATEVER flags that would be given to Python's re.compile. + UNICODE and IGNORECASE are on by default. 
""" + if isinstance(item, str): + return item + elif not isinstance(item, dict): + raise ValueError(f'Expected a string or a dictionary describing a regular expression.') + pattern = item.get('pattern') + # The pattern is permitted to be a string or list of strings, since in a JSON-style file we can't + # do the thing we do in python where we just juxtapose several strings, separated by whitespace + # and/or newlines, in order to have them taken as a single literal string. -kmp 29-Sep-2023 + if isinstance(pattern, str): + pass + elif isinstance(pattern, list): + pattern = ''.join(pattern) + else: + raise ValueError(f"Invalid pattern expression: {item!r}") + flags = item.get('flags') or [] + compilation_flags = re.IGNORECASE # UNICODE will default, but IGNORECASE we have to set up manually + for flag in flags: + if isinstance(flag, str) and flag.isupper(): + if hasattr(re, flag): + compilation_flags |= getattr(re, flag) + else: + raise ValueError(f"No such flag re.{flag}") + else: + raise ValueError(f"Flags must be strigs: {flag!r}") + regexp = re.compile(pattern, compilation_flags) + return regexp - COPYRIGHT_OWNER = "President and Fellows of Harvard College" - - ALLOWED = [ - - # <> - # Ref: https://opensource.org/license/0bsd/ - '0BSD', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Academic Free License (AFL)', - 'AFL-2.1', - - # Linking = Permissive, Private Use = Yes - # Apache licenses before version 2.0 are controversial, but we here construe an unmarked naming to imply - # any version, and hence v2. 
- # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Apache Software License', - 'Apache-Style', - pattern("Apache([- ]2([.]0)?)?([- ]Licen[cs]e)?([- ]with[- ]LLVM[- ]exception)?"), - # 'Apache-2.0', - - # Artistic License 1.0 was confusing to people, so its status as permissive is in general uncertain, - # however the issue seems to revolve around point 8 (relating to whether or not perl is deliberately - # exposed). That isn't in play for our uses, so we don't flag it here. - # Artistic license 2.0 is a permissive license. - # Ref: https://en.wikipedia.org/wiki/Artistic_License - 'Artistic-1.0-Perl', - pattern('Artistic[- ]2([.]0)?'), - - # According to Wikipedia, the Boost is considered permissive and BSD-like. - # Refs: - # * - # * https://en.wikipedia.org/wiki/Boost_(C%2B%2B_libraries)#License - pattern('(BSL|Boost(([- ]Software)?[- ]License)?)([- ]1([.]0)?)?'), - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - pattern('((modified[- ])?[234][- ]Clause[- ])?BSD([- ][234][- ]Clause)?( Licen[cs]e)?'), - # 'BSD License', - # 'BSD-2-Clause', - # 'BSD-3-Clause', - # 'BSD 3-Clause', - - # BZIP2 is a permissive license - # Ref: https://github.com/asimonov-im/bzip2/blob/master/LICENSE - pattern('bzip2(-1[.0-9]*)'), - - # Linking = Public Domain, Private Use = Public Domain - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'CC0', - 'CC0-1.0', - - # Linking = Permissive, Private Use = Permissive - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'CC-BY', - 'CC-BY-3.0', - 'CC-BY-4.0', - - # The curl license is a permissive license. - # Ref: https://curl.se/docs/copyright.html - 'curl', - - # Linking = Permissive, Private Use = ? 
- # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'CDDL', - - # The original Eclipse Distribution License 1.0 is essentially a BSD-3-Clause license. - # Ref: https://www.eclipse.org/org/documents/edl-v10.php - 'Eclipse Distribution License', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Eclipse Public License', - 'EPL-2.0', - - # The FSF Unlimited License (FSFUL) seems to be a completely permissive license. - # Refs: - # * https://spdx.org/licenses/FSFUL.html - # * https://fedoraproject.org/wiki/Licensing/FSF_Unlimited_License - 'FSF Unlimited License', - 'FSFUL', - - # The FreeType license is a permissive license. - # Ref: LicenseRef-FreeType - pattern('(Licen[cs]eRef-)?(FTL|FreeType( Licen[cs]e)?)'), - - # Linking = Yes, Cat = Permissive Software Licenses - # Ref: https://en.wikipedia.org/wiki/Historical_Permission_Notice_and_Disclaimer - 'Historical Permission Notice and Disclaimer (HPND)', - 'HPND', - pattern('(Licen[cs]eRef-)?PIL'), - # The Pillow or Python Image Library is an HPND license, which is a simple permissive license: - # Refs: - # * https://github.com/python-pillow/Pillow/blob/main/LICENSE - # * https://www.fsf.org/blogs/licensing/historical-permission-notice-and-disclaimer-added-to-license-list - - # The IJG license, used by Independent JPEG Group (IJG) is a custom permissive license. 
- # Refs: - # * https://en.wikipedia.org/wiki/Libjpeg - # * https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/LICENSE.md - 'IJG', - - # Linking = Permissive, Private Use = Permissive - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'ISC License (ISCL)', - 'ISC', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'MIT License', - 'MIT', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Mozilla Public License 2.0 (MPL 2.0)', - 'MPL-1.1', - 'MPL-2.0', - - # The SIL Open Font License appears to be a copyleft-style license that applies narrowly - # to icons and not to the entire codebase. It is advertised as OK for use even in commercial - # applications. - # Ref: https://fontawesome.com/license/free - 'OFL-1.1', - - # Ref: https://en.wikipedia.org/wiki/Public_domain - pattern('(Licen[cs]eRef-)?Public[- ]Domain([- ]dedic[t]?ation)?'), # "dedictation" is a typo in docutils - - # Linking = Permissive, Private Use = Permissive - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - pattern('(Licen[cs]eRef-)?PSF-2([.][.0-9]*)'), - 'Python Software Foundation License', - 'Python-2.0', - - # License = BSD-like - # Ref: https://en.wikipedia.org/wiki/Pylons_project - 'Repoze Public License', - - # The TCL or Tcl/Tk licenses are permissive licenses. - # Ref: https://www.tcl.tk/software/tcltk/license.html - # The one used by the tktable library has a 'bourbon' clause that doesn't add compliance requirements - # Ref: https://github.com/wjoye/tktable/blob/master/license.txt - pattern('Tcl([/]tk)?'), - - # The Ubuntu Font Licence is mostly permissive. It contains some restrictions if you are going to modify the - # fonts that require you to change the name to avoid confusion. 
But for our purposes, we're assuming that's - # not done, and so we're not flagging it. - pattern('Ubuntu Font Licen[cs]e Version( 1([.]0)?)?'), - - # Linking = Permissive/Public domain, Private Use = Permissive/Public domain - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'The Unlicense (Unlicense)', - 'Unlicense', - - # Various licenses seem to call themselves or be summed up as unlimited. - # So far we know of none that are not highly permissive. - # * boot and KernSmooth are reported by R as being 'Unlimited' - # Refs: - # * https://cran.r-project.org/web/packages/KernSmooth/index.html - # (https://github.com/cran/KernSmooth/blob/master/LICENCE.note) - # * https://cran.r-project.org/package=boot - # (https://github.com/cran/boot/blob/master/DESCRIPTION) - 'Unlimited', - - # Linking = Permissive, Private Use = ? - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'W3C License', - 'W3C-20150513', - - # Linking = Permissive/Public Domain, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'WTFPL', - - # Copyleft = No - # Ref: https://en.wikipedia.org/wiki/Zlib_License - # Linking = Permissive, Private Use = ? (for zlib/libpng license) - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Zlib', - - # Copyleft = No, FSF/OSI-approved: Yes - # Ref: https://en.wikipedia.org/wiki/Zope_Public_License - 'Zope Public License', - ] - - EXCEPTIONS = { - - # The Bioconductor zlibbioc license is a permissive license. 
- # Ref: https://github.com/Bioconductor/zlibbioc/blob/devel/LICENSE - 'Custom: bioconductor-zlibbioc file LICENSE': [ - 'bioconductor-zlibbioc' - ], - - # The Bioconductor rsamtools license is an MIT license - # Ref: https://bioconductor.org/packages/release/bioc/licenses/Rsamtools/LICENSE - 'Custom: bioconductor-rsamtools file LICENSE': [ - 'bioconductor-rsamtools' - ], - - # DFSG = Debian Free Software Guidelines - # Ref: https://en.wikipedia.org/wiki/Debian_Free_Software_Guidelines - # Used as an apparent modifier to other licenses, to say they are approved per Debian. - # For example in this case, pytest-timeout has license: DFSG approved, MIT License, - # but is really just an MIT License that someone has checked is DFSG approved. - 'DFSG approved': [ - 'pytest-timeout', # MIT Licensed - ], - - 'FOSS': [ - # The r-stringi library is a conda library that implements a stringi (pronounced "stringy") library for R. - # The COnda source feed is: https://github.com/conda-forge/r-stringi-feedstock - # This page explains that the home source is https://stringi.gagolewski.com/ but that's a doc page. - # The doc page says: - # > stringi’s source code is hosted on GitHub. - # > It is distributed under the open source BSD-3-clause license. - # The source code has a license that begins with a BSD-3-clause license and includes numerous others, - # but they all appear to be permissive. 
- # Ref: https://github.com/gagolews/stringi/blob/master/LICENSE - 'stringi', 'r-stringi', - ], - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v2 or later (LGPLv2+)': [ - 'chardet' # used at runtime during server operation (ingestion), but not modified or distributed - ], - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v3 or later (LGPLv3+)': [ - # used only privately in testing, not used in server code, not modified, not distributed - 'pytest-redis', - # required by pytest-redis (used only where it's used) - 'mirakuru', - ], - - 'GNU General Public License (GPL)': [ - 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs - ], - - 'MIT/X11 Derivative': [ - # The license used by libxkbcommon is complicated and involves numerous included licenses, - # but all are permissive. - # Ref: https://github.com/xkbcommon/libxkbcommon/blob/master/LICENSE - 'libxkbcommon', - ], - - 'None': [ - # It's not obvious why Conda shows this license as 'None'. - # In fact, though, BSD 3-Clause "New" or "Revised" License - # Ref: https://github.com/AnacondaRecipes/_libgcc_mutex-feedstock/blob/master/LICENSE.txt - '_libgcc_mutex', - ], - - 'PostgreSQL': [ - # The libpq library is actually licensed with a permissive BSD 3-Clause "New" or "Revised" License - # Ref: https://github.com/lpsmith/postgresql-libpq/blob/master/LICENSE - 'libpq', - ], - - 'UCSD': [ - # It isn't obvious why these show up with a UCSD license in Conda. 
- # The actual sources say it should be a 2-clause BSD license: - # Refs: - # * https://github.com/AlexandrovLab/SigProfilerMatrixGenerator/blob/master/LICENSE - # * https://github.com/AlexandrovLab/SigProfilerPlotting/blob/master/LICENSE - 'sigprofilermatrixgenerator', - 'sigprofilerplotting', - ], - - 'X11': [ - # The ncurses library has a VERY complicated history, BUT seems consistently permissive - # and the most recent version seems to be essentially the MIT license. - # Refs: - # * https://en.wikipedia.org/wiki/Ncurses#License - # * https://invisible-island.net/ncurses/ncurses-license.html - 'ncurses' - ], - - 'zlib-acknowledgement': [ - # It isn't clear whey libpng shows up with this license name, but the license for libpng - # is a permissive license. - # Ref: https://github.com/glennrp/libpng/blob/libpng16/LICENSE - 'libpng', - ], - - } - - EXPECTED_MISSING_LICENSES = [ - - # This is a name we use for our C4 portals. And it isn't published. - # We inherited the name from the Stanford ENCODE group, which had an MIT-licensed repo we forked - 'encoded', # cgap-portal, fourfront, and smaht-portal all call themselves this - - # We believe that since these next here are part of the Pylons project, they're covered under - # the same license as the other Pylons projects. We're seeking clarification. - 'pyramid-translogger', - 'subprocess-middleware', - - # This appears to be a BSD 2-Clause "Simplified" License, according to GitHub. - # PyPi also says it's a BSD license. - # Ref: https://github.com/paulc/dnslib/blob/master/LICENSE - 'dnslib', - - # This says it wants an ISC License, which we already have approval for but just isn't showing up. - # Ref: https://github.com/rthalley/dnspython/blob/master/LICENSE - 'dnspython', - - # This appears to be a mostly-MIT-style license. - # There are references to parts being in the public domain, though it's not obvious if that's meaningful. - # It's probably sufficient for our purposes to treat this as a permissive license. 
- # Ref: https://github.com/tlsfuzzer/python-ecdsa/blob/master/LICENSE - 'ecdsa', - - # This has an MIT license in its source repository - # Ref: https://github.com/xlwings/jsondiff/blob/master/LICENSE - 'jsondiff', - - # This has an MIT license in its source repository - # Ref: https://github.com/pkerpedjiev/negspy/blob/master/LICENSE - 'negspy', - - # This license statement is complicated, but seems adequately permissive. - # Ref: https://foss.heptapod.net/python-libs/passlib/-/blob/branch/stable/LICENSE - 'passlib', - - # This seems to be a BSD-3-Clause license. - # Ref: https://github.com/protocolbuffers/protobuf/blob/main/LICENSE - # pypi agrees in the Meta section of protobuf's page, where it says "3-Clause BSD License" - # Ref: https://pypi.org/project/protobuf/ - 'protobuf', - - # The WTFPL license is permissive. - # Ref: https://github.com/mk-fg/pretty-yaml/blob/master/COPYING - 'pyaml', - - # This uses a BSD license - # Ref: https://github.com/eliben/pycparser/blob/master/LICENSE - 'pycparser', - - # The source repo for pyDes says this is under an MIT license - # Ref: https://github.com/twhiteman/pyDes/blob/master/LICENSE.txt - # pypi, probably wrongly, thinks this is in the public domain (as of 2023-07-21) - # Ref: https://pypi.org/project/pyDes/ - 'pyDes', - - # This uses an MIT license - # Ref: https://github.com/pysam-developers/pysam/blob/master/COPYING - 'pysam', - - # The version of python-lambda that we forked calls itself this (and publishes at pypi under this name) - "python-lambda-4dn", - - # This is MIT-licensed: - # Ref: https://github.com/themiurgo/ratelim/blob/master/LICENSE - # pypi agrees - # Ref: https://pypi.org/project/ratelim/ - 'ratelim', - - # This is a BSD-3-Clause-Modification license - # Ref: https://github.com/repoze/repoze.debug/blob/master/LICENSE.txt - 'repoze.debug', - - # This is an Apache-2.0 license - # Ref: https://github.com/getsentry/responses/blob/master/LICENSE - 'responses', - - # This seems to get flagged 
sometimes, but is not the pypi snovault library, it's what our dcicsnovault - # calls itself internally. In any case, it's under MIT license and OK. - # Ref: https://github.com/4dn-dcic/snovault/blob/master/LICENSE.txt - 'snovault', - - # PyPi identifies the supervisor library license as "BSD-derived (http://www.repoze.org/LICENSE.txt)" - # Ref: https://pypi.org/project/supervisor/ - # In fact, though, the license is a bit more complicated, though apparently still permissive. - # Ref: https://github.com/Supervisor/supervisor/blob/main/LICENSES.txt - 'supervisor', - - # This seems to be a BSD-3-Clause-Modification license. - # Ref: https://github.com/Pylons/translationstring/blob/master/LICENSE.txt - 'translationstring', - - # This seems to be a BSD-3-Clause-Modification license. - # Ref: https://github.com/Pylons/venusian/blob/master/LICENSE.txt - 'venusian', - # PyPi identifies zope.deprecation as using the "Zope Public License (ZPL 2.1)" license. - # Ref: https://github.com/zopefoundation/Zope/blob/master/LICENSE.txt - 'zope.deprecation', - - # Below are licenses last known to have licenses missing in pip-licenses and need to be investigated further. - # Note well that just because pip-licenses doesn't know the license doesn't mean the software has - # no license. It may just mean the library is poorly registered in pypi. Some licenses have to be - # found by looking at the library's documentation or source files. - - # (all of these have been classified at this point) - - ] - - -@LicenseCheckerRegistry.register_checker('park-lab-pipeline') -class ParkLabPipelineLicenseChecker(ParkLabCommonLicenseChecker): +def read_license_policy_file(file): """ - Minimal checker common to pipelines from Park Lab. + Reads a license policy file, which is a JSONC file (can contain JSON with Javascript-style comments) + The policy is a dictionary, but the ALLOWED option is a list that can contain special syntax allowing + a regular expression to be inferred. 
See documentation of `string_or_regexp_dict` for details. """ + data = JsoncParser.parse_file(file) + allowed = data.get('ALLOWED') + if isinstance(allowed, list): + # The "ALLOWED" option is specially permitted to contain regular expressions. + data['ALLOWED'] = [literal_string_or_regexp_from_dict(allowance) for allowance in allowed] + return data - LICENSE_FRAMEWORKS = ['python', 'conda', 'r'] +_MY_DIR = os.path.dirname(__file__) -@LicenseCheckerRegistry.register_checker('park-lab-gpl-pipeline') -class ParkLabGplPipelineLicenseChecker(ParkLabCommonLicenseChecker): - """ - Minimal checker common to GPL pipelines from Park Lab. - """ +POLICY_DIR = os.path.join(_MY_DIR, "license_policies") - ALLOWED = ParkLabPipelineLicenseChecker.ALLOWED + [ - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - # The "exceptions", if present, indicate waivers to source delivery requirements. - # Ref: https://spdx.org/licenses/LGPL-3.0-linking-exception.html - pattern('GNU Lesser General Public License v2( or later)?( [(]LGPL[v]?[23][+]?[)])?'), - # 'GNU Lesser General Public License v2 or later (LGPLv2+)', - # 'GNU Lesser General Public License v3 or later (LGPLv3+)', - # 'LGPLv2', 'LGPL-v2', 'LGPL-v2.0', 'LGPL-2', 'LGPL-2.0', - # 'LGPLv2+', 'LGPL-v2+', 'LGPL-v2.0+', 'LGPL-2+', 'LGPL-2.0+', - # 'LGPLv3', 'LGPL-v3', 'LGPL-v3.0', 'LGPL-3', 'LGPL-3.0', - # 'LGPLv3+', 'LGPL-v3+', 'LGPL-v3.0+', 'LGPL-3+', 'LGPL-3.0+', - pattern('LGPL[v-]?[.0-9]*([+]|-only)?([- ]with[- ]exceptions)?'), - - # Uncertain whether this is LGPL 2 or 3, but in any case we think weak copyleft should be OK - # for pipeline or server use as long as we're not distributing sources. 
- 'LGPL', - 'GNU Library or Lesser General Public License (LGPL)', - - # GPL - # * library exception operates like LGPL - # * classpath exception is a linking exception related to Oracle - # Refs: - # * https://www.gnu.org/licenses/old-licenses/gpl-1.0.en.html - # * https://spdx.org/licenses/GPL-2.0-with-GCC-exception.html - # * https://spdx.org/licenses/GPL-3.0-with-GCC-exception.html - pattern('(GNU General Public License|GPL)[ ]?[v-]?[123]([.]0)?([+]|[- ]only)?' - '([- ]with[- ]GCC(([- ]runtime)?[- ]library)?[- ]exception([- ][.0-9]*)?)?' - '([- ]with[- ]Classpath[- ]exception([- ][.0-9]+)?)?'), - - # Linking = "GPLv3 compatible only", Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GPL-2-or-3', # we sometimes generate this token - # 'GPLv2+', 'GPL-v2+', 'GPL-v2.0+', 'GPL-2+', 'GPL-2.0+', - # 'GPLv3', 'GPL-v3', 'GPL-v3.0', 'GPL-3', 'GPL-3.0', - # 'GPLv3+', 'GPL-v3+', 'GPL-v3.0+', 'GPL-3+', 'GPL-3.0+', - # 'GPLv3-only', 'GPL-3-only', 'GPL-v3-only', 'GPL-3.0-only', 'GPL-v3.0-only', - - # Uncertain whether this is GPL 2 or 3, but we'll assume that means we can use either. - # And version 3 is our preferred interpretation. - 'GNU General Public License', - 'GPL', - - RLicenseFramework.R_LANGUAGE_LICENSE_NAME - - ] - - -@LicenseCheckerRegistry.register_checker('park-lab-common-server') -class ParkLabCommonServerLicenseChecker(ParkLabCommonLicenseChecker): - """ - Checker for servers from Park Lab. +POLICY_DATA_CACHE = {} - If you're at some other organization, we recommend you make a class that has values - suitable to your own organizational needs. - """ - LICENSE_FRAMEWORKS = ['python', 'javascript'] - - EXCEPTIONS = augment( - ParkLabCommonLicenseChecker.EXCEPTIONS, - by={ - 'BSD*': [ - # Although modified to insert the author name into the license text itself, - # the license for these libraries are essentially BSD-3-Clause. 
- 'formatio', - 'samsam', - - # There are some slightly different versions of what appear to be BSD licenses here, - # but clearly the license is permissive. - # Ref: https://www.npmjs.com/package/mutation-observer?activeTab=readme - 'mutation-observer', - ], - - 'Custom: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global': [ - # The use of this URL appears to be a syntax error in the definition of entries-ponyfill - # In fact this seems to be covered by a CC0-1.0 license. - # Ref: https://unpkg.com/browse/object.entries-ponyfill@1.0.1/LICENSE - 'object.entries-ponyfill', - ], - - 'Custom: https://github.com/saikocat/colorbrewer.': [ - # The use of this URL appears to be a syntax error in the definition of cartocolor - # In fact, this seems to be covered by a CC-BY-3.0 license. - # Ref: https://www.npmjs.com/package/cartocolor?activeTab=readme - 'cartocolor', - ], - - 'Custom: https://travis-ci.org/component/emitter.png': [ - # The use of this png appears to be a syntax error in the definition of emitter-component. - # In fact, emitter-component uses an MIT License - # Ref: https://www.npmjs.com/package/emitter-component - # Ref: https://github.com/component/emitter/blob/master/LICENSE - 'emitter-component', - ], - - # The 'turfs-jsts' repository (https://github.com/DenisCarriere/turf-jsts/blob/master/README.md) - # seems to lack a license, but appears to be forked from the jsts library that uses - # the Eclipse Public License 1.0 and Eclipse Distribution License 1.0, so probably a permissive - # license is intended. 
- 'Custom: https://travis-ci.org/DenisCarriere/turf-jsts.svg': [ - 'turf-jsts' - ], - - 'GNU General Public License (GPL)': [ - 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs - ], - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - # 'GNU Lesser General Public License v3 or later (LGPLv3+)', - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Library or Lesser General Public License (LGPL)': [ - 'psycopg2', # Used at runtime during server operation, but not modified or distributed - 'psycopg2-binary', # Used at runtime during server operation, but not modified or distributed - 'chardet', # Potentially used downstream in loadxl to detect charset for text files - 'pyzmq', # Used in post-deploy-perf-tests, not distributed, and not modified or distributed - ], - - 'GPL-2.0': [ - # The license file for the node-forge javascript library says: - # - # "You may use the Forge project under the terms of either the BSD License or the - # GNU General Public License (GPL) Version 2." - # - # (We choose to use it under the BSD license.) - # Ref: https://www.npmjs.com/package/node-forge?activeTab=code - 'node-forge', - ], - - 'MIT*': [ - - # This library uses a mix of licenses, but they (MIT, CC0) generally seem permissive. - # (It also mentions that some tools for building/testing use other libraries.) - # Ref: https://github.com/requirejs/domReady/blob/master/LICENSE - 'domready', - - # This library is under 'COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1' - # Ref: https://github.com/javaee/jsonp/blob/master/LICENSE.txt - # About CDDL ... - # Linking = Permissive, Private Use = ? 
- # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'jsonp', - - # This library says pretty clearly it intends MIT license. - # Ref: https://www.npmjs.com/package/component-indexof - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'component-indexof', - - # These look like a pretty straight MIT license. - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'mixin', # LICENSE file at https://www.npmjs.com/package/mixin?activeTab=code - 'stack-trace', # https://github.com/stacktracejs/stacktrace.js/blob/master/LICENSE - 'typed-function', # LICENSE at https://www.npmjs.com/package/typed-function?activeTab=code - - ], - - 'UNLICENSED': [ - # The udn-browser library is our own and has been observed to sometimes show up in some contexts - # as UNLICENSED, when really it's MIT. - # Ref: https://github.com/dbmi-bgm/udn-browser/blob/main/LICENSE - 'udn-browser', - ], - }) - - -@LicenseCheckerRegistry.register_checker('c4-infrastructure') -class C4InfrastructureLicenseChecker(ParkLabCommonServerLicenseChecker): - """ - Checker for C4 infrastructure (Fourfront, CGAP, SMaHT) from Park Lab. 
- """ +def built_in_policy_names(): + return [ + os.path.splitext(os.path.basename(license_policy_path))[0] + for license_policy_path in glob.glob(os.path.join(POLICY_DIR, "*.jsonc"))] - LICENSE_TITLE = "(The )?MIT License" + +def find_policy_data(policy_name: str, policy_dir: Optional[str] = None, + use_cache: bool = True, error_if_missing: bool = True): + policy_dir = POLICY_DIR if policy_dir is None else policy_dir + existing_data = POLICY_DATA_CACHE.get(policy_name) if use_cache else None + if existing_data: + return existing_data + else: + filename = os.path.join(policy_dir, policy_name + ".jsonc") + if not os.path.exists(filename): + if error_if_missing: + raise ValueError(f"No such policy: {policy_name!r}") + else: + return None + data = read_license_policy_file(filename) + POLICY_DATA_CACHE[policy_name] = data + return data -@LicenseCheckerRegistry.register_checker('c4-python-infrastructure') -class C4PythonInfrastructureLicenseChecker(C4InfrastructureLicenseChecker): +def find_or_create_license_class(*, policy_name: str, policy_dir: str, for_env, + # This next argument should never be passed explicitly by callers other than + # recursive calls to this function. -kmp 28-Sep-2023 + _creation_attmpts_in_progress=None): + """ + Define a policy class given a policy name (like 'c4-infrastructure'). 
+ """ + _creation_attmpts_in_progress = _creation_attmpts_in_progress or [] + existing_checker = LicenseCheckerRegistry.find_checker(checker_name=policy_name) + if existing_checker: + return existing_checker + elif policy_name in _creation_attmpts_in_progress: + raise ValueError(f"Circular reference to {policy_name} detected" + f" while creating {conjoined_list(_creation_attmpts_in_progress)}.") + _creation_attmpts_in_progress.append(policy_name) + license_checker_class_name = to_camel_case(policy_name) + "LicenseChecker" + policy_data = find_policy_data(policy_name, policy_dir=policy_dir) + inherits_from = policy_data.get('inherits_from') + if not isinstance(inherits_from, list): + raise ValueError(f'Policy {policy_name!r} needs "inherits_from": [...parent names...],' + f' which may be empty but must be specified.') + license_frameworks = policy_data.get('LICENSE_FRAMEWORKS') + if license_frameworks == "ALL": + policy_data['LICENSE_FRAMEWORKS'] = LicenseFrameworkRegistry.all_framework_names() + parent_classes = [find_or_create_license_class(policy_name=parent_name, policy_dir=policy_dir, for_env=for_env, + _creation_attmpts_in_progress=_creation_attmpts_in_progress) + for parent_name in inherits_from] + defaulted_policy_data = default_policy_data(policy_name=policy_name, policy_data=policy_data, + parent_classes=parent_classes) + new_class = type(license_checker_class_name, + (*parent_classes, LicenseChecker), + {'_policy_data': policy_data, **defaulted_policy_data}) + new_class.__doc__ = policy_data.get("description") or f'License policy {policy_name} needs a "description".' + assert isinstance(new_class, type) and issubclass(new_class, LicenseChecker) # Sigh. 
PyCharm can't figure this out + license_policy_class: Type[LicenseChecker] = new_class + decorator = LicenseCheckerRegistry.register_checker(name=policy_name) + registered_class = decorator(license_policy_class) + command = f"{license_checker_class_name} = LicenseCheckerRegistry.lookup_checker({repr(policy_name)})" + if LicenseOptions.DEBUG: # pragma: no cover - this doesn't have to work for production + PRINT(f"Executing: {command}") + exec(command, for_env) + if LicenseOptions.DEBUG: # pragma: no cover - this doesn't have to work for production + PRINT(f" {license_checker_class_name}.LICENSE_FRAMEWORKS" + f" = {eval(license_checker_class_name, for_env).LICENSE_FRAMEWORKS!r}") + _creation_attmpts_in_progress.remove(policy_name) + return registered_class + + +def use_policy_literal(*, policy_name, policy_datum, other_policy_data): + """This is used for datum that requires no merging. The policy_datum is returned. Other arguments are ignored.""" + ignored(policy_name, other_policy_data) + return policy_datum + + +def str_or_regexp_sort_key(datum: Union[str, Regexp]): """ - Checker for C4 python library infrastructure (Fourfront, CGAP, SMaHT) from Park Lab. + Returns a key for a datum that is an element of a list of elements that are strings or compiled regular expressions. + Regular expressions will sort where their parttern would be in the series of strings. """ - LICENSE_FRAMEWORKS = ['python'] + # Rationale: We want something like this just to make testing predictable. + if isinstance(datum, str): + return datum + else: + return datum.pattern -@LicenseCheckerRegistry.register_checker('scan2-pipeline') -class Scan2PipelineLicenseChecker(ParkLabGplPipelineLicenseChecker): +def merge_policy_lists(*, policy_name, policy_datum, other_policy_data, sort_key=None): """ - Checker for SCAN2 library from Park Lab. + Merges a set of policy lists by appending them and de-duplicating. + By default, the result list is assumed to be homogenous in type and suitable for sorting. 
+ If the list is of heterogeneous type, a sort_key is must be supplied to allow a total ordering. """ + ignored(policy_name) + result = policy_datum + for other_datum in other_policy_data: + result += other_datum + # de-duplicate and apply a deterministic ordering to make testing easier. + return sorted(set(result), key=sort_key) - EXCEPTIONS = augment( - ParkLabGplPipelineLicenseChecker.EXCEPTIONS, - by={ - 'Custom: Matrix file LICENCE': [ - # The custom information in https://cran.r-project.org/web/packages/Matrix/LICENCE - # says there are potential extra restrictions beyond a simple GPL license - # if SparseSuite is used, but it is not requested explicitly by Scan2, and we're - # trusting that any other libraries used by Scan2 would have investigated this. - # So, effectively, we think the Matrix library for this situation operates the - # same as if it were just GPL-3 licensed, and we are fine with that. - 'Matrix' - ], - - "MISSING": [ - # mysql-common and mysql-libs are GPL, but since they are delivered by conda - # and not distributed as part of the Scan2 distribution, they should be OK. 
- # Ref: https://redresscompliance.com/mysql-license-a-complete-guide-to-licensing/#:~:text=commercial%20use # noQA - 'mysql-common', - 'mysql-libs', - - # This is our own library - 'r-scan2', 'scan2', - ] - } - ) - EXPECTED_MISSING_LICENSES = ParkLabGplPipelineLicenseChecker.EXPECTED_MISSING_LICENSES + [ +def merge_policy_strings_or_regexps(*, policy_name, policy_datum, other_policy_data): + return merge_policy_lists(policy_name=policy_name, policy_datum=policy_datum, other_policy_data=other_policy_data, + sort_key=str_or_regexp_sort_key) + + +def merge_policy_dicts(*, policy_name, policy_datum, other_policy_data): + ignored(policy_name) + merged = defaultdict(lambda: []) + + def add_to_merged(d): + for k, values in d.items(): + for value in values: + merged[k].append(value) + + add_to_merged(policy_datum) + for other_datum in other_policy_data: + add_to_merged(other_datum) + + return {k: sorted(set(v)) for k, v in sorted(merged.items())} + + +POLICY_ATTRS: callable = { + 'class_key': use_policy_literal, + 'class_name': use_policy_literal, + 'inherits_from': use_policy_literal, + 'description': use_policy_literal, + 'LICENSE_TITLE': use_policy_literal, + 'COPYRIGHT_OWNER': use_policy_literal, + 'LICENSE_FRAMEWORKS': use_policy_literal, + 'ALLOWED': merge_policy_strings_or_regexps, + 'EXPECTED_MISSING_LICENSES': merge_policy_lists, + 'EXCEPTIONS': merge_policy_dicts, +} + +POLICY_MERGE_LISTS = {'ALLOWED', 'EXPECTED_MISSING_LICENSES'} +POLICY_MERGE_DICTS = {'EXCEPTIONS'} + + +def get_attrs_for_classes(attr: str, class_data: List[Type]): + result = [] + for class_datum in class_data: + attr_val = getattr(class_datum, attr, None) # Intentionally treats explicit None the same as missing + if attr_val is not None: + result.append(attr_val) + return result + + +def default_policy_data(*, policy_name: str, policy_data: AnyJsonData, parent_classes: List[Type]): + result = {} + for key_to_default, val_to_be_defaulted in policy_data.items(): + attr_handler: 
Optional[callable] = POLICY_ATTRS.get(key_to_default) + if attr_handler is None: + raise ValueError(f"Bad policy attribute: {key_to_default}") + result[key_to_default] = attr_handler(policy_name=policy_name, policy_datum=val_to_be_defaulted, + other_policy_data=get_attrs_for_classes(key_to_default, parent_classes)) + return result + + +def load_license_policies(for_env, policy_dir=None): + for policy_name in built_in_policy_names(): + find_or_create_license_class(policy_name=policy_name, policy_dir=policy_dir, for_env=for_env) + - ] +load_license_policies(for_env=globals()) diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 115fd00ff..66ef4a371 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -1344,7 +1344,11 @@ def to_camel_case(s): """ Converts a string that might be in snake_case or CamelCase into CamelCase. """ - if s[:1].isupper() and '_' not in s: + hyphen_found = False + if '-' in s: + hyphen_found = True + s = s.replace('-', '_') + if not hyphen_found and s[:1].isupper() and '_' not in s: return s else: return snake_case_to_camel_case(s) diff --git a/dcicutils/scripts/run_license_checker.py b/dcicutils/scripts/run_license_checker.py index f2324c0cf..0742ac8e2 100644 --- a/dcicutils/scripts/run_license_checker.py +++ b/dcicutils/scripts/run_license_checker.py @@ -10,7 +10,8 @@ EPILOG = __doc__ -ALL_CHECKER_NAMES = LicenseCheckerRegistry.all_checker_names() +ALL_CHECKER_NAMES = sorted(LicenseCheckerRegistry.all_checker_names(), + key=lambda x: 'aaaaa-' + x if x.startswith('park-lab-') else x) NEWLINE = '\n' diff --git a/poetry.lock b/poetry.lock index d7e77523c..c59c8f953 100644 --- a/poetry.lock +++ b/poetry.lock @@ -884,6 +884,18 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "jsonc-parser" +version = "1.1.5" +description = "A lightweight, native tool for parsing .jsonc files" +category = "main" +optional = false 
+python-versions = ">=3.5" +files = [ + {file = "jsonc-parser-1.1.5.tar.gz", hash = "sha256:7126d17725b0413cd40af4297d9f6412c4181a62135e4c41cdf8f6a82c5936e6"}, + {file = "jsonc_parser-1.1.5-py3-none-any.whl", hash = "sha256:abd1db76a4c6d1733ec7bb5340a89c49cbc878a181a1e7947ee6719eedf2c6cc"}, +] + [[package]] name = "mccabe" version = "0.7.0" @@ -1594,4 +1606,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.7,<3.10" -content-hash = "b8d6612bb28cfb9da79306a82b2ac35a20678e1f62ef86c93b8af3c3d1ed798e" +content-hash = "ca11caee3bf14b381e0aaec68ca6bca23f89064db9d90a61e9500e23eab8106f" diff --git a/pyproject.toml b/pyproject.toml index 1fdb3578b..f919add77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ elasticsearch = "7.13.4" aws-requests-auth = ">=0.4.2,<1" docker = "^4.4.4" gitpython = "^3.1.2" +jsonc-parser = "^1.1.5" pytz = ">=2020.4" PyYAML = ">=5.1,<5.5" requests = "^2.21.0" diff --git a/test/test_license_utils.py b/test/test_license_utils.py index bb87ce8d8..d8e352556 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -1,19 +1,26 @@ import copy import datetime +import glob import io import json import os import pytest +import re import subprocess as subprocess_module from collections import defaultdict +from dcicutils import license_utils as license_utils_module +from dcicutils.common import Regexp from dcicutils.license_utils import ( - LicenseOptions, LicenseFrameworkRegistry, LicenseFramework, + POLICY_DIR, + LicenseOptions, LicenseFrameworkRegistry, LicenseFramework, LicenseCheckerRegistry, PythonLicenseFramework, JavascriptLicenseFramework, CondaLicenseFramework, RLicenseFramework, LicenseAnalysis, LicenseChecker, LicenseStatus, LicenseFileParser, LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure, warnings as license_utils_warnings_module, - extract_boolean_terms, simplify_license_versions, + 
extract_boolean_terms, simplify_license_versions, load_license_policies, literal_string_or_regexp_from_dict, + default_policy_data, str_or_regexp_sort_key, get_attrs_for_classes, find_or_create_license_class, + use_policy_literal, merge_policy_lists, merge_policy_strings_or_regexps, merge_policy_dicts, built_in_policy_names, ) from dcicutils.misc_utils import ignored, file_contents, local_attrs from dcicutils.qa_utils import printed_output, MockFileSystem @@ -760,3 +767,229 @@ def mocked_license_logger(message): LicenseFileParser.validate_simple_license_file(filename='LICENSE.txt', analysis=analysis) assert analysis.miscellaneous == ["The copyright year, '2020', should have '2023' at the end."] assert license_warnings == [] + + +def test_default_policy_data(): + + class MyCondaClass(LicenseChecker): + LICENSE_FRAMEWORKS = ['conda'] + EXCEPTIONS = { + 'something': ['some-lib'] + } + + def check_it(input, expected, *, parents=None): + parents = parents or [] + assert default_policy_data(policy_name='some-policy', policy_data=input, parent_classes=parents) == expected + + check_it({'LICENSE_FRAMEWORKS': ['a', 'b']}, {'LICENSE_FRAMEWORKS': ['a', 'b']}) + check_it({'LICENSE_FRAMEWORKS': ['a', 'b']}, {'LICENSE_FRAMEWORKS': ['a', 'b']}, parents=[MyCondaClass]) + check_it({}, {}, parents=[MyCondaClass]) + + check_it( + { + 'EXCEPTIONS': { + 'something': ['some-random-lib'], + 'something-else': ['some-other-lib'] + } + }, + { + 'EXCEPTIONS': { + 'something': ['some-lib', 'some-random-lib'], + 'something-else': ['some-other-lib'] + } + }, + parents=[MyCondaClass]) + + +def test_use_policy_literal(): + + class MyIgnoredLicenseChecker(LicenseChecker): + pass + + assert use_policy_literal(policy_name='ignored', policy_datum='anything', + other_policy_data=[MyIgnoredLicenseChecker]) == 'anything' + + +def test_str_or_regexp_sort_key(): + + assert str_or_regexp_sort_key('foo') == 'foo' + assert str_or_regexp_sort_key(re.compile('foo')) == 'foo' + + +def test_merge_policy_lists(): 
+ + list1 = ['a', 'c', 'b'] + list2 = ['f', 'a'] + list3 = ['g', 'a'] + + actual = merge_policy_lists(policy_name='ignored', policy_datum=list1, other_policy_data=[]) + expected = ['a', 'b', 'c'] + assert actual == expected + + actual = merge_policy_lists(policy_name='ignored', policy_datum=list1, other_policy_data=[list2]) + expected = ['a', 'b', 'c', 'f'] + assert actual == expected + + actual = merge_policy_lists(policy_name='ignored', policy_datum=list1, other_policy_data=[list2, list3]) + expected = ['a', 'b', 'c', 'f', 'g'] + assert actual == expected + + with pytest.raises(Exception): + merge_policy_lists(policy_name='ignored', policy_datum=['a', re.compile('foo')], other_policy_data=[]) + + +def test_merge_policy_strings_or_regexps(): + + regexp_foo = re.compile('foo') + regexp_bar = re.compile('bar') + + list1 = ['a', regexp_foo, 'c', 'b'] + list2 = ['f', regexp_bar, 'a'] + list3 = [regexp_foo, 'g', 'a'] + + actual = merge_policy_strings_or_regexps(policy_name='ignored', policy_datum=list1, other_policy_data=[]) + expected = ['a', 'b', 'c', regexp_foo] + assert actual == expected + + actual = merge_policy_strings_or_regexps(policy_name='ignored', policy_datum=list1, other_policy_data=[list2]) + expected = ['a', 'b', regexp_bar, 'c', 'f', regexp_foo] + assert actual == expected + + actual = merge_policy_strings_or_regexps(policy_name='ignored', policy_datum=list1, + other_policy_data=[list2, list3]) + expected = ['a', 'b', regexp_bar, 'c', 'f', regexp_foo, 'g'] + assert actual == expected + + +def test_merge_policy_dicts(): + + dict1 = {'foo': ['a', 'b'], 'bar': ['x', 'z']} + dict2 = {'alpha': ['p', 'q']} + dict3 = {'foo': ['a', 'c'], 'baz': ['z', 'w']} + + actual = merge_policy_dicts(policy_name='ignored', policy_datum=dict1, other_policy_data=[]) + expected = {'bar': ['x', 'z'], 'foo': ['a', 'b']} + assert actual == expected + + actual = merge_policy_dicts(policy_name='ignored', policy_datum=dict1, other_policy_data=[dict2]) + expected = {'alpha': ['p', 
'q'], 'bar': ['x', 'z'], 'foo': ['a', 'b']} + assert actual == expected + + actual = merge_policy_dicts(policy_name='ignored', policy_datum=dict1, other_policy_data=[dict2, dict3]) + expected = {'alpha': ['p', 'q'], 'bar': ['x', 'z'], 'baz': ['w', 'z'], 'foo': ['a', 'b', 'c']} + assert actual == expected + + +def test_get_attrs_for_classes(): + + class ClassA: + PROP1 = 'val1A' + PROP2 = 'val2A' + + class ClassB: + PROP2 = 'val2B' + PROP3 = 'val3B' + + class ClassC: + PROP1 = 'val1C' + + class ClassAB(ClassA): + PROP1 = 'val1AB' + PROP2 = None + + # Note that the order of the results is the order of the classes in which the value occurs, NOT alphabetical. + + assert get_attrs_for_classes('PROP1', [ClassA]) == ['val1A'] + assert get_attrs_for_classes('PROP1', [ClassA, ClassB]) == ['val1A'] + assert get_attrs_for_classes('PROP1', [ClassA, ClassB, ClassC]) == ['val1A', 'val1C'] + assert get_attrs_for_classes('PROP1', [ClassA, ClassB, ClassC, ClassAB]) == ['val1A', 'val1C', 'val1AB'] + assert get_attrs_for_classes('PROP1', [ClassAB, ClassA, ClassB, ClassC]) == ['val1AB', 'val1A', 'val1C'] + + assert get_attrs_for_classes('PROP2', [ClassA]) == ['val2A'] + assert get_attrs_for_classes('PROP2', [ClassA, ClassB]) == ['val2A', 'val2B'] + assert get_attrs_for_classes('PROP2', [ClassA, ClassB, ClassC]) == ['val2A', 'val2B'] + assert get_attrs_for_classes('PROP2', [ClassA, ClassB, ClassC, ClassAB]) == ['val2A', 'val2B'] # None is ignored + assert get_attrs_for_classes('PROP2', [ClassAB, ClassA, ClassB, ClassC]) == ['val2A', 'val2B'] # ditto + + assert get_attrs_for_classes('PROP3', [ClassA]) == [] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB]) == ['val3B'] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB, ClassC]) == ['val3B'] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB, ClassC, ClassAB]) == ['val3B'] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB, ClassC, ClassAB]) == ['val3B'] + + +def test_literal_string_or_regexp_from_dict(): + + 
print() # start on a fresh line + + sample_string = "foo" + assert literal_string_or_regexp_from_dict(sample_string) == sample_string + + sample_regexp_pattern_1 = "foo.*" + sample_regexp_pattern_2 = "(bar)" + sample_regexp_pattern_3 = sample_regexp_pattern_1 + sample_regexp_pattern_2 + + default_flags = re.UNICODE | re.IGNORECASE + + result = literal_string_or_regexp_from_dict({"pattern": sample_regexp_pattern_1}) + assert isinstance(result, Regexp) + assert result.pattern == sample_regexp_pattern_1 + assert result.flags == default_flags + + result = literal_string_or_regexp_from_dict({"pattern": [sample_regexp_pattern_1, sample_regexp_pattern_2]}) + assert isinstance(result, Regexp) + assert result.pattern == sample_regexp_pattern_3 + + result = literal_string_or_regexp_from_dict({"pattern": sample_regexp_pattern_1, "flags": ["VERBOSE"]}) + assert isinstance(result, Regexp) + assert result.pattern == sample_regexp_pattern_1 + assert result.flags == default_flags | re.VERBOSE + + +def test_find_or_create_license_class(): + test_registry = {} + policy_data_cache = {} + + class TestChecker(LicenseChecker): + pass + + with mock.patch.object(license_utils_module, "find_policy_data") as mock_find_policy_data: + with mock.patch.object(LicenseCheckerRegistry, "REGISTRY", test_registry): + with mock.patch.object(license_utils_module, "POLICY_DATA_CACHE", policy_data_cache): + + # This tests the find part + test_registry['test'] = TestChecker + assert find_or_create_license_class(policy_name='test', policy_dir='ignored', + for_env='ignored') == TestChecker + mock_find_policy_data.assert_not_called() + + mock_find_policy_data.return_value = {"inherits_from": []} + local_env = locals() + # The command that gets executed will expect to use LicenseCheckerRegistry, which would be in globals() + # but is not in locals(), so we have to add it. 
-kmp 29-Sep-2023 + local_env["LicenseCheckerRegistry"] = license_utils_module.LicenseCheckerRegistry + policy_class = find_or_create_license_class(policy_name='something', policy_dir='/my/policy/dir', + for_env=local_env) + assert local_env["SomethingLicenseChecker"] == policy_class # check that it got installed in environment + assert issubclass(policy_class, LicenseChecker) + + +def test_load_license_policies(): + test_policy_names = ['my_project', 'your_project'] + policy_dir_for_testing = 'some/dir/' + some_env = 'some-env' + with mock.patch.object(license_utils_module, "find_or_create_license_class") as mock_find_or_create_license_class: + with mock.patch.object(license_utils_module, "built_in_policy_names") as mock_built_in_policy_names: + mock_built_in_policy_names.return_value = test_policy_names + load_license_policies(policy_dir=policy_dir_for_testing, for_env=some_env) + mock_find_or_create_license_class.assert_has_calls([ + mock.call(policy_name=policy_name, policy_dir=policy_dir_for_testing, for_env=some_env) + for policy_name in test_policy_names + ]) + + +def test_built_in_policy_names(): + test_project_names = ['my_project', 'your_project'] + with mock.patch.object(glob, "glob") as mock_glob_glob: + mock_glob_glob.return_value = [os.path.join(POLICY_DIR, f"{name}.jsonc") for name in test_project_names] + assert built_in_policy_names() == test_project_names diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index 778aabac3..5b80a8ae7 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -2000,8 +2000,9 @@ def test_snake_case_to_camel_case_hyphenated(token, expected): ('x_m_l_container', 'XMLContainer'), ('X_M_L_Container', 'XMLContainer'), ]) -def test_to_camel_case_hyphenated(token, expected): +def test_to_camel_case(token, expected): assert to_camel_case(token) == expected + assert to_camel_case(token.replace('_', '-')) == expected assert to_camel_case(expected) == expected # make sure it's stable From 
9978cb8ae5ef7c8adaef470cfc207b0769e035e0 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 29 Sep 2023 09:34:38 -0400 Subject: [PATCH 19/24] Adjust the run-license-checker command to look for custom policies. --- dcicutils/license_utils.py | 15 +++++++++++--- dcicutils/scripts/run_license_checker.py | 25 ++++++++++++++---------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 93abc42be..399b47831 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -77,14 +77,15 @@ class LicenseOptions: # Specific additional debugging output DEBUG = environ_bool("LICENSE_UTILS_DEBUG", default=False) CONDA_PREFIX = os.environ.get("CONDA_LICENSE_CHECKER_PREFIX", os.environ.get("CONDA_PREFIX", "")) + POLICY_DIR = os.environ.get("LICENSE_UTILS_POLICY_DIR") @classmethod @contextlib.contextmanager - def selected_options(cls, verbose=VERBOSE, debug=DEBUG, conda_prefix=CONDA_PREFIX): + def selected_options(cls, verbose=VERBOSE, debug=DEBUG, conda_prefix=CONDA_PREFIX, policy_dir=POLICY_DIR): """ Allows a script, for example, to specify overrides for these options dynamically. 
""" - with local_attrs(cls, VERBOSE=verbose, DEBUG=debug, CONDA_PREFIX=conda_prefix): + with local_attrs(cls, VERBOSE=verbose, DEBUG=debug, CONDA_PREFIX=conda_prefix, POLICY_DIR=policy_dir): yield @@ -735,9 +736,17 @@ def find_checker(cls, checker_name: str) -> Optional[Type[LicenseChecker]]: return cls.REGISTRY.get(checker_name, None) @classmethod - def lookup_checker(cls, checker_name: str) -> Type[LicenseChecker]: + def lookup_checker(cls, checker_name: str, autoload: bool = False) -> Type[LicenseChecker]: result: Optional[Type[LicenseChecker]] = cls.find_checker(checker_name) if result is None: + if autoload: + policy_dir = LicenseOptions.POLICY_DIR or POLICY_DIR + PRINT(f"Looking for custom policy {checker_name} in {policy_dir} ...") + result = find_or_create_license_class(policy_name=checker_name, + policy_dir=policy_dir, + for_env=globals()) + if result: + return result raise InvalidParameterError(parameter='checker_name', value=checker_name, options=cls.all_checker_names()) return result diff --git a/dcicutils/scripts/run_license_checker.py b/dcicutils/scripts/run_license_checker.py index 0742ac8e2..c07c06529 100644 --- a/dcicutils/scripts/run_license_checker.py +++ b/dcicutils/scripts/run_license_checker.py @@ -32,11 +32,14 @@ def main(): help="Requests additional debugging output.") parser.add_argument("--conda-prefix", "--conda_prefix", "--cp", default=LicenseOptions.CONDA_PREFIX, help=(f"Overrides the CONDA_PREFIX (default {LicenseOptions.CONDA_PREFIX!r}).")) + parser.add_argument("--policy-dir", "--policy_dir", "--pd", default=LicenseOptions.POLICY_DIR, + help=(f"Specifies a custom policy directory (default {LicenseOptions.POLICY_DIR!r}).")) args = parser.parse_args() with script_catch_errors(): - run_license_checker(name=args.name, verbose=not args.brief, debug=args.debug, conda_prefix=args.conda_prefix) + run_license_checker(name=args.name, verbose=not args.brief, debug=args.debug, conda_prefix=args.conda_prefix, + policy_dir=args.policy_dir) def 
show_help_for_choosing_license_checker(): @@ -63,16 +66,18 @@ def show_help_for_choosing_license_checker(): def run_license_checker(name: Optional[str], verbose=LicenseOptions.VERBOSE, debug=LicenseOptions.DEBUG, - conda_prefix=LicenseOptions.CONDA_PREFIX): + conda_prefix=LicenseOptions.CONDA_PREFIX, + policy_dir=LicenseOptions.POLICY_DIR): if name is None: show_help_for_choosing_license_checker() else: - try: - checker_class: Type[LicenseChecker] = LicenseCheckerRegistry.lookup_checker(name) - except Exception as e: - raise ScriptFailure(str(e)) - try: - with LicenseOptions.selected_options(verbose=verbose, debug=debug, conda_prefix=conda_prefix): + with LicenseOptions.selected_options(verbose=verbose, debug=debug, conda_prefix=conda_prefix, + policy_dir=policy_dir): + try: + checker_class: Type[LicenseChecker] = LicenseCheckerRegistry.lookup_checker(name, autoload=True) + except Exception as e: + raise ScriptFailure(str(e)) + try: checker_class.validate() - except LicenseCheckFailure as e: - raise ScriptFailure(get_error_message(e)) + except LicenseCheckFailure as e: + raise ScriptFailure(get_error_message(e)) From 03882425419f3c230e3070221513f07f2861319e Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 29 Sep 2023 11:29:04 -0400 Subject: [PATCH 20/24] Updates due to Will's code review. --- dcicutils/license_utils.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 399b47831..d2bd5d553 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -497,17 +497,26 @@ def report(message): class LicenseChecker: """ - There are three important class variables to specify: + License checkers are defined as .jsonc. The JSONC file format is JSON with Comments. + (The comments are Javascript syntax, either '//' or '/* ... */'.) 
+ + There are these important class variables to specify: LICENSE_TITLE is a string naming the license to be expected in LICENSE.txt COPYRIGHT_OWNER is the name of the copyright owner. - LICENSE_FRAMEWORKS will default to all defined frameworks (presently ['python', 'javascript'], but can be limited to - just ['python'] for example. It doesn't make a lot of sense to limit it to ['javascript'], though you could, - since you are using a Python library to do this, and it probably needs to have its dependencies checked. + LICENSE_FRAMEWORKS will default to all defined frameworks (presently ['python', 'javascript'], + but can be limited to just ['python'] for example. It doesn't make a lot of sense to limit it to + ['javascript'], though you could, since you are using a Python library to do this, and it probably + needs to have its dependencies checked. - ALLOWED is a list of license names as returned by the pip-licenses library. + ALLOWED is a list of license names as returned by the various license frameworks. Because they rely on different + underlying tools the exact format of the names that result might vary. (For this reason, there is a regular + expression capability for this particular attribute, so in addition to just a string, you can also use + {"pattern": ""} For very long regular expressions, {"pattern": ["", ...]} will + concatenate all the parts into a single regexp so they can be gracefully broken over lines in the .jsonc + source file. If regexp flags are requierd, use {"pattern" "", "flags": ["flag1", ...]}. EXPECTED_MISSING_LICENSES is a list of libraries that are expected to have no license information. This is so you don't have to get warning fatigue by seeing a warning over and over for things you know about. 
@@ -896,7 +905,11 @@ def find_or_create_license_class(*, policy_name: str, policy_dir: str, for_env, (*parent_classes, LicenseChecker), {'_policy_data': policy_data, **defaulted_policy_data}) new_class.__doc__ = policy_data.get("description") or f'License policy {policy_name} needs a "description".' - assert isinstance(new_class, type) and issubclass(new_class, LicenseChecker) # Sigh. PyCharm can't figure this out + # Sigh. PyCharm can't figure this out type fact out, even with a type hint on the above assignment to new_class, + # such as 'new_class: Type[LicenseChecker] = ...'. That should have worked. Putting in an assert was the only way + # I could find to convince PyCharm of the truth. I don't expect this assertion to ever fail. It's just an artifact + # to prevent ugly browser highlighting. I'll try to arrange a bug report for them. -kmp 29-Sep-2023 + assert isinstance(new_class, type) and issubclass(new_class, LicenseChecker) license_policy_class: Type[LicenseChecker] = new_class decorator = LicenseCheckerRegistry.register_checker(name=policy_name) registered_class = decorator(license_policy_class) @@ -1006,4 +1019,9 @@ def load_license_policies(for_env, policy_dir=None): find_or_create_license_class(policy_name=policy_name, policy_dir=policy_dir, for_env=for_env) +# This will cause the definitions of classes to in the predefined set to be exported by this library +# in case they need to be imported elsewhere, for example to use in unit-testing. Those are things like +# * ParkLabCommonLicenseChecker, etc. +# * C4InfrastructureLicenseChecker, etc. +# See license_policies/*.jsonc for a full list. load_license_policies(for_env=globals()) From bd57fa54995cb8c83effc62dac86158663cf81ac Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 29 Sep 2023 12:25:28 -0400 Subject: [PATCH 21/24] Remove the exec() and eval() uses. 
--- dcicutils/license_utils.py | 35 +++++++++++++++++++---------------- test/test_license_utils.py | 16 ++++------------ 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index d2bd5d553..8d6553b0a 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -512,10 +512,10 @@ class LicenseChecker: needs to have its dependencies checked. ALLOWED is a list of license names as returned by the various license frameworks. Because they rely on different - underlying tools the exact format of the names that result might vary. (For this reason, there is a regular - expression capability for this particular attribute, so in addition to just a string, you can also use + underlying tools the exact format of the names that result might vary. For this reason, there is a regular + expression capability for this particular attribute. In addition to just a string, you can also use {"pattern": ""} For very long regular expressions, {"pattern": ["", ...]} will - concatenate all the parts into a single regexp so they can be gracefully broken over lines in the .jsonc + concatenate all the parts into a single regexp, so they can be gracefully broken over lines in the .jsonc source file. If regexp flags are requierd, use {"pattern" "", "flags": ["flag1", ...]}. EXPECTED_MISSING_LICENSES is a list of libraries that are expected to have no license information. 
@@ -752,8 +752,7 @@ def lookup_checker(cls, checker_name: str, autoload: bool = False) -> Type[Licen policy_dir = LicenseOptions.POLICY_DIR or POLICY_DIR PRINT(f"Looking for custom policy {checker_name} in {policy_dir} ...") result = find_or_create_license_class(policy_name=checker_name, - policy_dir=policy_dir, - for_env=globals()) + policy_dir=policy_dir) if result: return result raise InvalidParameterError(parameter='checker_name', value=checker_name, @@ -872,7 +871,7 @@ def find_policy_data(policy_name: str, policy_dir: Optional[str] = None, return data -def find_or_create_license_class(*, policy_name: str, policy_dir: str, for_env, +def find_or_create_license_class(*, policy_name: str, policy_dir: str, # This next argument should never be passed explicitly by callers other than # recursive calls to this function. -kmp 28-Sep-2023 _creation_attmpts_in_progress=None): @@ -896,7 +895,7 @@ def find_or_create_license_class(*, policy_name: str, policy_dir: str, for_env, license_frameworks = policy_data.get('LICENSE_FRAMEWORKS') if license_frameworks == "ALL": policy_data['LICENSE_FRAMEWORKS'] = LicenseFrameworkRegistry.all_framework_names() - parent_classes = [find_or_create_license_class(policy_name=parent_name, policy_dir=policy_dir, for_env=for_env, + parent_classes = [find_or_create_license_class(policy_name=parent_name, policy_dir=policy_dir, _creation_attmpts_in_progress=_creation_attmpts_in_progress) for parent_name in inherits_from] defaulted_policy_data = default_policy_data(policy_name=policy_name, policy_data=policy_data, @@ -913,13 +912,10 @@ def find_or_create_license_class(*, policy_name: str, policy_dir: str, for_env, license_policy_class: Type[LicenseChecker] = new_class decorator = LicenseCheckerRegistry.register_checker(name=policy_name) registered_class = decorator(license_policy_class) - command = f"{license_checker_class_name} = LicenseCheckerRegistry.lookup_checker({repr(policy_name)})" if LicenseOptions.DEBUG: # pragma: no cover - this doesn't 
have to work for production - PRINT(f"Executing: {command}") - exec(command, for_env) - if LicenseOptions.DEBUG: # pragma: no cover - this doesn't have to work for production - PRINT(f" {license_checker_class_name}.LICENSE_FRAMEWORKS" - f" = {eval(license_checker_class_name, for_env).LICENSE_FRAMEWORKS!r}") + found_class = LicenseCheckerRegistry.lookup_checker(policy_name) + PRINT(f"Registered checker class {policy_name!r}" + f" with license_frameworks {conjoined_list(found_class.LICENSE_FRAMEWORKS)}.") _creation_attmpts_in_progress.remove(policy_name) return registered_class @@ -1014,9 +1010,9 @@ def default_policy_data(*, policy_name: str, policy_data: AnyJsonData, parent_cl return result -def load_license_policies(for_env, policy_dir=None): +def load_license_policies(policy_dir=None): for policy_name in built_in_policy_names(): - find_or_create_license_class(policy_name=policy_name, policy_dir=policy_dir, for_env=for_env) + find_or_create_license_class(policy_name=policy_name, policy_dir=policy_dir) # This will cause the definitions of classes to in the predefined set to be exported by this library @@ -1024,4 +1020,11 @@ def load_license_policies(for_env, policy_dir=None): # * ParkLabCommonLicenseChecker, etc. # * C4InfrastructureLicenseChecker, etc. # See license_policies/*.jsonc for a full list. 
-load_license_policies(for_env=globals()) +load_license_policies() + +ParkLabCommonLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-common') +ParkLabCommonServerLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-common-server') +ParkLabPipelineLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-pipeline') +ParkLabGplPipelineLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-gpl-pipeline') +C4InfrastructureLicenseChecker = LicenseCheckerRegistry.lookup_checker('c4-infrastructure') +C4PythonInfrastructureLicenseChecker = LicenseCheckerRegistry.lookup_checker('c4-python-infrastructure') diff --git a/test/test_license_utils.py b/test/test_license_utils.py index d8e352556..e0d0fdc25 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -959,31 +959,23 @@ class TestChecker(LicenseChecker): # This tests the find part test_registry['test'] = TestChecker - assert find_or_create_license_class(policy_name='test', policy_dir='ignored', - for_env='ignored') == TestChecker + assert find_or_create_license_class(policy_name='test', policy_dir='ignored') == TestChecker mock_find_policy_data.assert_not_called() mock_find_policy_data.return_value = {"inherits_from": []} - local_env = locals() - # The command that gets executed will expect to use LicenseCheckerRegistry, which would be in globals() - # but is not in locals(), so we have to add it. 
-kmp 29-Sep-2023 - local_env["LicenseCheckerRegistry"] = license_utils_module.LicenseCheckerRegistry - policy_class = find_or_create_license_class(policy_name='something', policy_dir='/my/policy/dir', - for_env=local_env) - assert local_env["SomethingLicenseChecker"] == policy_class # check that it got installed in environment + policy_class = find_or_create_license_class(policy_name='something', policy_dir='/my/policy/dir') assert issubclass(policy_class, LicenseChecker) def test_load_license_policies(): test_policy_names = ['my_project', 'your_project'] policy_dir_for_testing = 'some/dir/' - some_env = 'some-env' with mock.patch.object(license_utils_module, "find_or_create_license_class") as mock_find_or_create_license_class: with mock.patch.object(license_utils_module, "built_in_policy_names") as mock_built_in_policy_names: mock_built_in_policy_names.return_value = test_policy_names - load_license_policies(policy_dir=policy_dir_for_testing, for_env=some_env) + load_license_policies(policy_dir=policy_dir_for_testing) mock_find_or_create_license_class.assert_has_calls([ - mock.call(policy_name=policy_name, policy_dir=policy_dir_for_testing, for_env=some_env) + mock.call(policy_name=policy_name, policy_dir=policy_dir_for_testing) for policy_name in test_policy_names ]) From ab9ee5406dc33655225e1ffd6ff78a92af8ca349 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 29 Sep 2023 12:38:19 -0400 Subject: [PATCH 22/24] De-beta as 7.13.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f919add77..17a1703c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.12.0.1b4" # to become 7.13.0 +version = "7.13.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" From 20e61533a8a140c7f8c9a3bab297a34a326f1246 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: 
Fri, 29 Sep 2023 12:57:10 -0400 Subject: [PATCH 23/24] Small tweak to allow lookup_checker to use a specified policy dir when called from code. --- dcicutils/license_utils.py | 7 ++++--- dcicutils/scripts/run_license_checker.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index 8d6553b0a..ce01a5c80 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -745,11 +745,12 @@ def find_checker(cls, checker_name: str) -> Optional[Type[LicenseChecker]]: return cls.REGISTRY.get(checker_name, None) @classmethod - def lookup_checker(cls, checker_name: str, autoload: bool = False) -> Type[LicenseChecker]: + def lookup_checker(cls, checker_name: str, autoload: bool = False, + policy_dir: Optional[str] = None) -> Type[LicenseChecker]: result: Optional[Type[LicenseChecker]] = cls.find_checker(checker_name) if result is None: - if autoload: - policy_dir = LicenseOptions.POLICY_DIR or POLICY_DIR + if autoload or policy_dir: + policy_dir = policy_dir or LicenseOptions.POLICY_DIR or POLICY_DIR PRINT(f"Looking for custom policy {checker_name} in {policy_dir} ...") result = find_or_create_license_class(policy_name=checker_name, policy_dir=policy_dir) diff --git a/dcicutils/scripts/run_license_checker.py b/dcicutils/scripts/run_license_checker.py index c07c06529..7cafdae37 100644 --- a/dcicutils/scripts/run_license_checker.py +++ b/dcicutils/scripts/run_license_checker.py @@ -31,9 +31,9 @@ def main(): parser.add_argument("--debug", '-q', default=False, action="store_true", help="Requests additional debugging output.") parser.add_argument("--conda-prefix", "--conda_prefix", "--cp", default=LicenseOptions.CONDA_PREFIX, - help=(f"Overrides the CONDA_PREFIX (default {LicenseOptions.CONDA_PREFIX!r}).")) + help=f"Overrides the CONDA_PREFIX (default {LicenseOptions.CONDA_PREFIX!r}).") parser.add_argument("--policy-dir", "--policy_dir", "--pd", default=LicenseOptions.POLICY_DIR, - 
help=(f"Specifies a custom policy directory (default {LicenseOptions.POLICY_DIR!r}).")) + help=f"Specifies a custom policy directory (default {LicenseOptions.POLICY_DIR!r}).") args = parser.parse_args() From d868a0518c67c0214b71e88834bd274d9da7d476 Mon Sep 17 00:00:00 2001 From: Kent Pitman Date: Fri, 29 Sep 2023 13:08:03 -0400 Subject: [PATCH 24/24] Make autoload default. It can be used instead to suppress autoload. --- dcicutils/license_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index ce01a5c80..ab9aa9d76 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -745,11 +745,11 @@ def find_checker(cls, checker_name: str) -> Optional[Type[LicenseChecker]]: return cls.REGISTRY.get(checker_name, None) @classmethod - def lookup_checker(cls, checker_name: str, autoload: bool = False, + def lookup_checker(cls, checker_name: str, autoload: bool = True, policy_dir: Optional[str] = None) -> Type[LicenseChecker]: result: Optional[Type[LicenseChecker]] = cls.find_checker(checker_name) if result is None: - if autoload or policy_dir: + if autoload: policy_dir = policy_dir or LicenseOptions.POLICY_DIR or POLICY_DIR PRINT(f"Looking for custom policy {checker_name} in {policy_dir} ...") result = find_or_create_license_class(policy_name=checker_name,