diff --git a/.gitignore b/.gitignore index 141fe1474..0731c2579 100644 --- a/.gitignore +++ b/.gitignore @@ -121,3 +121,7 @@ ENV/ # PyCharm metadata .idea/ + +# Vi +*.swp +*.swo diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 886766943..b0c9245e0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ dcicutils Change Log ---------- -7.11.0 +7.14.0 ====== * New module ``transfer_utils``: @@ -14,6 +14,60 @@ Change Log * Creates new utilities for downloading files and patching a location back to the portal +7.13.0 +====== + +* In ``license_utils``: + + * Add an ``RLanguageFramework``. + + * Add various additional checker classes, and a registry to catalog them. Refactor so that pre-existing + classes better share information in an inherited way. + + +------------------------------------------+--------------------------------+----------------+ + | Class | Checker Name | Status | + +==========================================+================================+================+ + | ``ParkLabCommonLicenseChecker`` | ``park-lab-common`` | New | + +------------------------------------------+--------------------------------+----------------+ + | ``ParkLabGplPipelineLicenseChecker`` | ``park-lab-gpl-pipeline`` | New | + +------------------------------------------+--------------------------------+----------------+ + | ``ParkLabCommonServerLicenseChecker`` | ``park-lab-common-server`` | New | + +------------------------------------------+--------------------------------+----------------+ + | ``C4InfrastructureLicenseChecker`` | ``c4-infrastructure`` | Refactored | + +------------------------------------------+--------------------------------+----------------+ + | ``C4PythonInfrastructureLicenseChecker`` | ``c4-python-infrastructure`` | Refactored | + +------------------------------------------+--------------------------------+----------------+ + | ``Scan2PipelineLicenseChecker`` | ``scan2-pipeline`` | New | +
+------------------------------------------+--------------------------------+----------------+ + +* In ``misc_utils``: + + * New function ``json_file_contents`` + +* In ``scripts``: + + * Add a ``run-license-checker`` script, implemented by ``run_license_checker.py``, + that runs the license checker whose "checker name" is given as an argument. + + +7.12.0 +====== + +* In ``glacier_utils``: + + * Add functionality for KMS key encrypted accounts + + +7.11.0 +====== + +* In ``ff_utils``: + + * Fix in ``get_schema`` and ``get_schemas`` for the ``portal_vapp`` case needing a leading slash on the URL. + * Fix in ``get_schema`` and ``get_schemas`` for the ``portal_vapp`` returning webtest.response.TestResponse + which has a ``json`` object property rather than a function. + + 7.10.0 ====== diff --git a/dcicutils/common.py b/dcicutils/common.py index b4f487cf3..13d518455 100644 --- a/dcicutils/common.py +++ b/dcicutils/common.py @@ -1,4 +1,5 @@ import os +import re from typing import Dict, Union, Tuple, List, Any from typing_extensions import Literal @@ -36,6 +37,8 @@ LIBRARY_DIR = os.path.dirname(__file__) +Regexp = type(re.compile("sample")) + # ===== Auth Data ===== AuthStr = str diff --git a/dcicutils/ff_utils.py b/dcicutils/ff_utils.py index bbf5cdf85..907537b22 100644 --- a/dcicutils/ff_utils.py +++ b/dcicutils/ff_utils.py @@ -17,7 +17,7 @@ # S3BucketName, S3KeyName, ) from .lang_utils import disjoined_list -from .misc_utils import PRINT, to_camel_case, remove_suffix, VirtualApp +from .misc_utils import PRINT, to_camel_case, remove_suffix, VirtualApp, VirtualAppResponse # TODO (C4-92, C4-102): Probably to centralize this information in env_utils. Also figure out relation to CGAP. 
@@ -1006,7 +1006,7 @@ def get_schema(name, key=None, ff_env: Optional[str] = None, portal_env: Optiona base_url = f"profiles/{to_camel_case(name)}.json" add_on = 'frame=raw' if portal_vapp: - full_url = f"{base_url}?{add_on}" + full_url = f"/{base_url}?{add_on}" res = portal_vapp.get(full_url) return get_response_json(res) else: @@ -1038,7 +1038,7 @@ def get_schemas(key=None, ff_env: Optional[str] = None, *, allow_abstract: bool base_url = 'profiles/' add_on = 'frame=raw' if portal_vapp: - full_url = f"{base_url}?{add_on}" + full_url = f"/{base_url}?{add_on}" schemas: Dict[str, Dict] = portal_vapp.get(full_url) else: schemas: Dict[str, Dict] = get_metadata(obj_id=base_url, key=key, ff_env=portal_env, add_on=add_on) @@ -1504,7 +1504,10 @@ def get_response_json(res): it is not present. Used with the metadata functions. """ try: - res_json = res.json() + if isinstance(res, VirtualAppResponse): + res_json = res.json + else: + res_json = res.json() except Exception: raise Exception('Cannot get json for request to %s. Status' ' code: %s. 
Response text: %s' % diff --git a/dcicutils/glacier_utils.py b/dcicutils/glacier_utils.py index bbcf77893..7609ab316 100644 --- a/dcicutils/glacier_utils.py +++ b/dcicutils/glacier_utils.py @@ -58,6 +58,10 @@ def __init__(self, env_name: str): self.env_key = self.key_manager.get_keydict_for_env(env_name) self.health_page = get_health_page(key=self.env_key, ff_env=env_name) + @property + def kms_key_id(self) -> str: + return self.health_page.get("s3_encrypt_key_id", "") + @classmethod def is_glacier_storage_class(cls, storage_class: S3StorageClass): return storage_class in S3_GLACIER_CLASSES @@ -295,6 +299,9 @@ def _do_multipart_upload(self, bucket: str, key: str, total_size: int, part_size } if tags: cmu['Tagging'] = tags + if self.kms_key_id: + cmu['ServerSideEncryption'] = 'aws:kms' + cmu['SSEKMSKeyId'] = self.kms_key_id mpu = self.s3.create_multipart_upload(**cmu) mpu_upload_id = mpu['UploadId'] except Exception as e: @@ -381,16 +388,21 @@ def copy_object_back_to_original_location(self, bucket: str, key: str, storage_c else: # Force copy the object into standard in a single operation copy_source = {'Bucket': bucket, 'Key': key} - copy_target = { + copy_args = { 'Bucket': bucket, 'Key': key, 'StorageClass': storage_class, } if version_id: copy_source['VersionId'] = version_id - copy_target['CopySourceVersionId'] = version_id + copy_args['CopySourceVersionId'] = version_id if tags: - copy_target['Tagging'] = tags - response = self.s3.copy_object(CopySource=copy_source, **copy_target) + copy_args['Tagging'] = tags + if self.kms_key_id: + copy_args['ServerSideEncryption'] = 'aws:kms' + copy_args['SSEKMSKeyId'] = self.kms_key_id + response = self.s3.copy_object( + **copy_args, CopySource=copy_source + ) PRINT(f'Response from boto3 copy:\n{response}') PRINT(f'Object {bucket}/{key} copied back to its original location in S3') return response diff --git a/dcicutils/license_policies/c4-infrastructure.jsonc b/dcicutils/license_policies/c4-infrastructure.jsonc new file 
mode 100644 index 000000000..7a77448f6 --- /dev/null +++ b/dcicutils/license_policies/c4-infrastructure.jsonc @@ -0,0 +1,8 @@ +{ + "class_key": "c4-infrastructure", + "class_name": "C4InfrastructureLicenseChecker", + "inherits_from": ["park-lab-common-server"], + "description": "Checker for C4 infrastructure (Fourfront, CGAP, SMaHT) from Park Lab.", + + "LICENSE_TITLE": "(The )?MIT License" +} diff --git a/dcicutils/license_policies/c4-python-infrastructure.jsonc b/dcicutils/license_policies/c4-python-infrastructure.jsonc new file mode 100644 index 000000000..12a4afcf2 --- /dev/null +++ b/dcicutils/license_policies/c4-python-infrastructure.jsonc @@ -0,0 +1,8 @@ +{ + "class_key": "c4-python-infrastructure", + "class_name": "C4PythonInfrastructureLicenseChecker", + "inherits_from": ["c4-infrastructure"], + "description": "Checker for C4 python library infrastructure (Fourfront, CGAP, SMaHT) from Park Lab.", + + "LICENSE_FRAMEWORKS": ["python"] +} diff --git a/dcicutils/license_policies/park-lab-common-server.jsonc b/dcicutils/license_policies/park-lab-common-server.jsonc new file mode 100644 index 000000000..72c1af930 --- /dev/null +++ b/dcicutils/license_policies/park-lab-common-server.jsonc @@ -0,0 +1,104 @@ +{ + "class_key": "park-lab-common-server", + "inherits_from": ["park-lab-common"], + "description": "Minimal/generic checker for servers from Park Lab.", + + "LICENSE_FRAMEWORKS": ["python", "javascript"], + + "EXCEPTIONS": { + "BSD*": [ + // Although modified to insert the author name into the license text itself, + // the license for these libraries are essentially BSD-3-Clause. + "formatio", + "samsam", + + // There are some slightly different versions of what appear to be BSD licenses here, + // but clearly the license is permissive. 
+ // Ref: https://www.npmjs.com/package/mutation-observer?activeTab=readme + "mutation-observer" + ], + "Custom: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global": [ + // The use of this URL appears to be a syntax error in the definition of entries-ponyfill + // In fact this seems to be covered by a CC0-1.0 license. + // Ref: https://unpkg.com/browse/object.entries-ponyfill@1.0.1/LICENSE + "object.entries-ponyfill" + ], + "Custom: https://github.com/saikocat/colorbrewer.": [ + // The use of this URL appears to be a syntax error in the definition of cartocolor + // In fact, this seems to be covered by a CC-BY-3.0 license. + // Ref: https://www.npmjs.com/package/cartocolor?activeTab=readme + "cartocolor" + ], + "Custom: https://travis-ci.org/component/emitter.png": [ + // The use of this png appears to be a syntax error in the definition of emitter-component. + // In fact, emitter-component uses an MIT License + // Ref: https://www.npmjs.com/package/emitter-component + // Ref: https://github.com/component/emitter/blob/master/LICENSE + "emitter-component" + ], + "Custom: https://travis-ci.org/DenisCarriere/turf-jsts.svg": [ + // The 'turfs-jsts' repository (https://github.com/DenisCarriere/turf-jsts/blob/master/README.md) + // seems to lack a license, but appears to be forked from the jsts library that uses + // the Eclipse Public License 1.0 and Eclipse Distribution License 1.0, so probably a permissive + // license is intended. 
+ "turf-jsts" + ], + "GNU General Public License (GPL)": [ + "docutils" // Used only privately as a separate documentation-generation task for ReadTheDocs + ], + "GNU Library or Lesser General Public License (LGPL)": [ + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + // "GNU Lesser General Public License v3 or later (LGPLv3+)", + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "psycopg2", // Used at runtime during server operation, but not modified or distributed + "psycopg2-binary", // Used at runtime during server operation, but not modified or distributed + "chardet", // Potentially used downstream in loadxl to detect charset for text files + "pyzmq" // Used in post-deploy-perf-tests, not distributed, and not modified or distributed + ], + "GPL-2.0": [ + // The license file for the node-forge javascript library says: + // + // "You may use the Forge project under the terms of either the BSD License or the + // GNU General Public License (GPL) Version 2." + // + // (We choose to use it under the BSD license.) + // Ref: https://www.npmjs.com/package/node-forge?activeTab=code + "node-forge" + ], + "MIT*": [ + // This library uses a mix of licenses, but they (MIT, CC0) generally seem permissive. + // (It also mentions that some tools for building/testing use other libraries.) + // Ref: https://github.com/requirejs/domReady/blob/master/LICENSE + "domready", + + // This library is under "COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1" + // Ref: https://github.com/javaee/jsonp/blob/master/LICENSE.txt + // About CDDL ... + // Linking = Permissive, Private Use = ? + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "jsonp", + + // This library says pretty clearly it intends MIT license. 
+ // Ref: https://www.npmjs.com/package/component-indexof + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "component-indexof", + + // These look like a pretty straight MIT license. + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "mixin", // LICENSE file at https://www.npmjs.com/package/mixin?activeTab=code + "stack-trace", // https://github.com/stacktracejs/stacktrace.js/blob/master/LICENSE + "typed-function" // LICENSE at https://www.npmjs.com/package/typed-function?activeTab=code + ], + "UNLICENSED": [ + // The udn-browser library is our own and has been observed to sometimes show up in some contexts + // as UNLICENSED, when really it is MIT. + // Ref: https://github.com/dbmi-bgm/udn-browser/blob/main/LICENSE + "udn-browser" + ] + } +} diff --git a/dcicutils/license_policies/park-lab-common.jsonc b/dcicutils/license_policies/park-lab-common.jsonc new file mode 100644 index 000000000..e59d67aee --- /dev/null +++ b/dcicutils/license_policies/park-lab-common.jsonc @@ -0,0 +1,407 @@ +{ + "class_key": "park-lab-common", + "class_name": "ParkLabCommonLicenseChecker", + "inherits_from": [], + "description": "Minimal/generic checker common to all tech from Park Lab.", + + "COPYRIGHT_OWNER": "President and Fellows of Harvard College", + + "LICENSE_FRAMEWORKS": "ALL", + + "ALLOWED": [ + + // <> + // Ref: https://opensource.org/license/0bsd/ + "0BSD", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Academic Free License (AFL)", + "AFL-2.1", + + // Linking = Permissive, Private Use = Yes + // Apache licenses before version 2.0 are controversial, but we here construe an unmarked naming to imply + // any version, and hence v2. 
+ // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Apache Software License", + "Apache-Style", + {"pattern": "Apache([- ]2([.]0)?)?([- ]Licen[cs]e)?([- ]with[- ]LLVM[- ]exception)?"}, + // "Apache-2.0", + + // Artistic License 1.0 was confusing to people, so its status as permissive is in general uncertain, + // however the issue seems to revolve around point 8 (relating to whether or not perl is deliberately + // exposed). That isn't in play for our uses, so we don't flag it here. + // Artistic license 2.0 is a permissive license. + // Ref: https://en.wikipedia.org/wiki/Artistic_License + "Artistic-1.0-Perl", + {"pattern": "Artistic[- ]2([.]0)?"}, + + // According to Wikipedia, the Boost is considered permissive and BSD-like. + // Refs: + // * + // * https://en.wikipedia.org/wiki/Boost_(C%2B%2B_libraries)#License + {"pattern": "(BSL|Boost(([- ]Software)?[- ]License)?)([- ]1([.]0)?)?"}, + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + {"pattern": "((modified[- ])?[234][- ]Clause[- ])?BSD([- ][234][- ]Clause)?( Licen[cs]e)?"}, + // "BSD License", + // "BSD-2-Clause", + // "BSD-3-Clause", + // "BSD 3-Clause", + + // BZIP2 is a permissive license + // Ref: https://github.com/asimonov-im/bzip2/blob/master/LICENSE + {"pattern": "bzip2(-1[.0-9]*)"}, + + // Linking = Public Domain, Private Use = Public Domain + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "CC0", + "CC0-1.0", + + // Linking = Permissive, Private Use = Permissive + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "CC-BY", + "CC-BY-3.0", + "CC-BY-4.0", + + // The curl license is a permissive license. + // Ref: https://curl.se/docs/copyright.html + "curl", + + // Linking = Permissive, Private Use = ? 
+ // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "CDDL", + + // The original Eclipse Distribution License 1.0 is essentially a BSD-3-Clause license. + // Ref: https://www.eclipse.org/org/documents/edl-v10.php + "Eclipse Distribution License", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Eclipse Public License", + "EPL-2.0", + + // The FSF Unlimited License (FSFUL) seems to be a completely permissive license. + // Refs: + // * https://spdx.org/licenses/FSFUL.html + // * https://fedoraproject.org/wiki/Licensing/FSF_Unlimited_License + "FSF Unlimited License", + "FSFUL", + + // The FreeType license is a permissive license. + // Ref: LicenseRef-FreeType + {"pattern": "(Licen[cs]eRef-)?(FTL|FreeType( Licen[cs]e)?)"}, + + // Linking = Yes, Cat = Permissive Software Licenses + // Ref: https://en.wikipedia.org/wiki/Historical_Permission_Notice_and_Disclaimer + "Historical Permission Notice and Disclaimer (HPND)", + "HPND", + {"pattern": "(Licen[cs]eRef-)?PIL"}, + // The Pillow or Python Image Library is an HPND license, which is a simple permissive license: + // Refs: + // * https://github.com/python-pillow/Pillow/blob/main/LICENSE + // * https://www.fsf.org/blogs/licensing/historical-permission-notice-and-disclaimer-added-to-license-list + + // The IJG license, used by Independent JPEG Group (IJG) is a custom permissive license. 
+ // Refs: + // * https://en.wikipedia.org/wiki/Libjpeg + // * https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/LICENSE.md + "IJG", + + // Linking = Permissive, Private Use = Permissive + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "ISC License (ISCL)", + "ISC", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "MIT License", + "MIT", + + // Linking = Permissive, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Mozilla Public License 2.0 (MPL 2.0)", + "MPL-1.1", + "MPL-2.0", + + // The SIL Open Font License appears to be a copyleft-style license that applies narrowly + // to icons and not to the entire codebase. It is advertised as OK for use even in commercial + // applications. + // Ref: https://fontawesome.com/license/free + "OFL-1.1", + + // Ref: https://en.wikipedia.org/wiki/Public_domain + {"pattern": "(Licen[cs]eRef-)?Public[- ]Domain([- ]dedic[t]?ation)?"}, // "dedictation" is a typo in docutils + + // Linking = Permissive, Private Use = Permissive + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + {"pattern": "(Licen[cs]eRef-)?PSF-2([.][.0-9]*)"}, + "Python Software Foundation License", + "Python-2.0", + + // License = BSD-like + // Ref: https://en.wikipedia.org/wiki/Pylons_project + "Repoze Public License", + + // The TCL or Tcl/Tk licenses are permissive licenses. + // Ref: https://www.tcl.tk/software/tcltk/license.html + // The one used by the tktable library has a "bourbon" clause that doesn't add compliance requirements + // Ref: https://github.com/wjoye/tktable/blob/master/license.txt + {"pattern": "Tcl([/]tk)?"}, + + // The Ubuntu Font Licence is mostly permissive. It contains some restrictions if you are going to modify the + // fonts that require you to change the name to avoid confusion. 
But for our purposes, we're assuming that's + // not done, and so we're not flagging it. + {"pattern": "Ubuntu Font Licen[cs]e Version( 1([.]0)?)?"}, + + // Linking = Permissive/Public domain, Private Use = Permissive/Public domain + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "The Unlicense (Unlicense)", + "Unlicense", + + // Various licenses seem to call themselves or be summed up as unlimited. + // So far we know of none that are not highly permissive. + // * boot and KernSmooth are reported by R as being "Unlimited" + // Refs: + // * https://cran.r-project.org/web/packages/KernSmooth/index.html + // (https://github.com/cran/KernSmooth/blob/master/LICENCE.note) + // * https://cran.r-project.org/package=boot + // (https://github.com/cran/boot/blob/master/DESCRIPTION) + "Unlimited", + + // Linking = Permissive, Private Use = ? + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "W3C License", + "W3C-20150513", + + // Linking = Permissive/Public Domain, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "WTFPL", + + // Copyleft = No + // Ref: https://en.wikipedia.org/wiki/Zlib_License + // Linking = Permissive, Private Use = ? (for zlib/libpng license) + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "Zlib", + + // Copyleft = No, FSF/OSI-approved: Yes + // Ref: https://en.wikipedia.org/wiki/Zope_Public_License + "Zope Public License" + ], + + "EXCEPTIONS": { + + // The Bioconductor zlibbioc license is a permissive license. 
+ // Ref: https://github.com/Bioconductor/zlibbioc/blob/devel/LICENSE + "Custom: bioconductor-zlibbioc file LICENSE": [ + "bioconductor-zlibbioc" + ], + + // The Bioconductor rsamtools license is an MIT license + // Ref: https://bioconductor.org/packages/release/bioc/licenses/Rsamtools/LICENSE + "Custom: bioconductor-rsamtools file LICENSE": [ + "bioconductor-rsamtools" + ], + + // DFSG = Debian Free Software Guidelines + // Ref: https://en.wikipedia.org/wiki/Debian_Free_Software_Guidelines + // Used as an apparent modifier to other licenses, to say they are approved per Debian. + // For example in this case, pytest-timeout has license: DFSG approved, MIT License, + // but is really just an MIT License that someone has checked is DFSG approved. + "DFSG approved": [ + "pytest-timeout" // MIT Licensed + ], + + "FOSS": [ + // The r-stringi library is a conda library that implements a stringi (pronounced "stringy") library for R. + // The Conda source feed is: https://github.com/conda-forge/r-stringi-feedstock + // This page explains that the home source is https://stringi.gagolewski.com/ but that's a doc page. + // The doc page says: + // > stringi’s source code is hosted on GitHub. + // > It is distributed under the open source BSD-3-clause license. + // The source code has a license that begins with a BSD-3-clause license and includes numerous others, + // but they all appear to be permissive. 
+ // Ref: https://github.com/gagolews/stringi/blob/master/LICENSE + "stringi", + "r-stringi" + ], + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "GNU Lesser General Public License v2 or later (LGPLv2+)": [ + "chardet" // used at runtime during server operation (ingestion), but not modified or distributed + ], + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "GNU Lesser General Public License v3 or later (LGPLv3+)": [ + // used only privately in testing, not used in server code, not modified, not distributed + "pytest-redis", + // required by pytest-redis (used only where it's used) + "mirakuru" + ], + + "GNU General Public License (GPL)": [ + "docutils" // Used only privately as a separate documentation-generation task for ReadTheDocs + ], + + "MIT/X11 Derivative": [ + // The license used by libxkbcommon is complicated and involves numerous included licenses, + // but all are permissive. + // Ref: https://github.com/xkbcommon/libxkbcommon/blob/master/LICENSE + "libxkbcommon" + ], + + "None": [ + // It's not obvious why Conda shows this license as "None". + // In fact, though, BSD 3-Clause "New" or "Revised" License + // Ref: https://github.com/AnacondaRecipes/_libgcc_mutex-feedstock/blob/master/LICENSE.txt + "_libgcc_mutex" + ], + + "PostgreSQL": [ + // The libpq library is actually licensed with a permissive BSD 3-Clause "New" or "Revised" License + // Ref: https://github.com/lpsmith/postgresql-libpq/blob/master/LICENSE + "libpq" + ], + + "UCSD": [ + // It isn't obvious why these show up with a UCSD license in Conda. 
+ // The actual sources say it should be a 2-clause BSD license: + // Refs: + // * https://github.com/AlexandrovLab/SigProfilerMatrixGenerator/blob/master/LICENSE + // * https://github.com/AlexandrovLab/SigProfilerPlotting/blob/master/LICENSE + "sigprofilermatrixgenerator", + "sigprofilerplotting" + ], + + "X11": [ + // The ncurses library has a VERY complicated history, BUT seems consistently permissive + // and the most recent version seems to be essentially the MIT license. + // Refs: + // * https://en.wikipedia.org/wiki/Ncurses#License + // * https://invisible-island.net/ncurses/ncurses-license.html + "ncurses" + ], + + "zlib-acknowledgement": [ + // It isn't clear why libpng shows up with this license name, but the license for libpng + // is a permissive license. + // Ref: https://github.com/glennrp/libpng/blob/libpng16/LICENSE + "libpng" + ] + }, + + "EXPECTED_MISSING_LICENSES": [ + + // This is a name we use for our C4 portals. And it isn't published. + // We inherited the name from the Stanford ENCODE group, which had an MIT-licensed repo we forked + "encoded", // cgap-portal, fourfront, and smaht-portal all call themselves this + + // We believe that since these next here are part of the Pylons project, they're covered under + // the same license as the other Pylons projects. We're seeking clarification. + "pyramid-translogger", + "subprocess-middleware", + + // This appears to be a BSD 2-Clause "Simplified" License, according to GitHub. + // PyPi also says it's a BSD license. + // Ref: https://github.com/paulc/dnslib/blob/master/LICENSE + "dnslib", + + // This says it wants an ISC License, which we already have approval for but just isn't showing up. + // Ref: https://github.com/rthalley/dnspython/blob/master/LICENSE + "dnspython", + + // This appears to be a mostly-MIT-style license. + // There are references to parts being in the public domain, though it's not obvious if that's meaningful. 
+ // It's probably sufficient for our purposes to treat this as a permissive license. + // Ref: https://github.com/tlsfuzzer/python-ecdsa/blob/master/LICENSE + "ecdsa", + + // This has an MIT license in its source repository + // Ref: https://github.com/xlwings/jsondiff/blob/master/LICENSE + "jsondiff", + + // This has an MIT license in its source repository + // Ref: https://github.com/pkerpedjiev/negspy/blob/master/LICENSE + "negspy", + + // This license statement is complicated, but seems adequately permissive. + // Ref: https://foss.heptapod.net/python-libs/passlib/-/blob/branch/stable/LICENSE + "passlib", + + // This seems to be a BSD-3-Clause license. + // Ref: https://github.com/protocolbuffers/protobuf/blob/main/LICENSE + // pypi agrees in the Meta section of protobuf's page, where it says "3-Clause BSD License" + // Ref: https://pypi.org/project/protobuf/ + "protobuf", + + // The WTFPL license is permissive. + // Ref: https://github.com/mk-fg/pretty-yaml/blob/master/COPYING + "pyaml", + + // This uses a BSD license + // Ref: https://github.com/eliben/pycparser/blob/master/LICENSE + "pycparser", + + // The source repo for pyDes says this is under an MIT license + // Ref: https://github.com/twhiteman/pyDes/blob/master/LICENSE.txt + // pypi, probably wrongly, thinks this is in the public domain (as of 2023-07-21) + // Ref: https://pypi.org/project/pyDes/ + "pyDes", + + // This uses an MIT license + // Ref: https://github.com/pysam-developers/pysam/blob/master/COPYING + "pysam", + + // The version of python-lambda that we forked calls itself this (and publishes at pypi under this name) + "python-lambda-4dn", + + // This is MIT-licensed: + // Ref: https://github.com/themiurgo/ratelim/blob/master/LICENSE + // pypi agrees + // Ref: https://pypi.org/project/ratelim/ + "ratelim", + + // This is a BSD-3-Clause-Modification license + // Ref: https://github.com/repoze/repoze.debug/blob/master/LICENSE.txt + "repoze.debug", + + // This is an Apache-2.0 license + // Ref: 
https://github.com/getsentry/responses/blob/master/LICENSE + "responses", + + // This seems to get flagged sometimes, but is not the pypi snovault library, it's what our dcicsnovault + // calls itself internally. In any case, it's under MIT license and OK. + // Ref: https://github.com/4dn-dcic/snovault/blob/master/LICENSE.txt + "snovault", + + // PyPi identifies the supervisor library license as "BSD-derived (http://www.repoze.org/LICENSE.txt)" + // Ref: https://pypi.org/project/supervisor/ + // In fact, though, the license is a bit more complicated, though apparently still permissive. + // Ref: https://github.com/Supervisor/supervisor/blob/main/LICENSES.txt + "supervisor", + + // This seems to be a BSD-3-Clause-Modification license. + // Ref: https://github.com/Pylons/translationstring/blob/master/LICENSE.txt + "translationstring", + + // This seems to be a BSD-3-Clause-Modification license. + // Ref: https://github.com/Pylons/venusian/blob/master/LICENSE.txt + "venusian", + + // PyPi identifies zope.deprecation as using the "Zope Public License (ZPL 2.1)" license. + // Ref: https://github.com/zopefoundation/Zope/blob/master/LICENSE.txt + "zope.deprecation" + + // Below are licenses last known to have licenses missing in pip-licenses and need to be investigated further. + // Note well that just because pip-licenses doesn't know the license doesn't mean the software has + // no license. It may just mean the library is poorly registered in pypi. Some licenses have to be + // found by looking at the library's documentation or source files. 
+ + // (all of these have been classified at this point) + ] +} diff --git a/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc b/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc new file mode 100644 index 000000000..1ff0b2723 --- /dev/null +++ b/dcicutils/license_policies/park-lab-gpl-pipeline.jsonc @@ -0,0 +1,62 @@ +{ + "class_key": "park-lab-gpl-pipeline", + "class_name": "ParkLabGplPipelineLicenseChecker", + "inherits_from": ["park-lab-pipeline"], + "description": "Minimal/generic checker for GPL-approved pipelines from Park Lab.", + + "ALLOWED": [ + + // Linking = With Restrictions, Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + // The "exceptions", if present, indicate waivers to source delivery requirements. + // Ref: https://spdx.org/licenses/LGPL-3.0-linking-exception.html + {"pattern": "GNU Lesser General Public License v2( or later)?( [(]LGPL[v]?[23][+]?[)])?"}, + // "GNU Lesser General Public License v2 or later (LGPLv2+)", + // "GNU Lesser General Public License v3 or later (LGPLv3+)", + // "LGPLv2", "LGPL-v2", "LGPL-v2.0", "LGPL-2", "LGPL-2.0", + // "LGPLv2+", "LGPL-v2+", "LGPL-v2.0+", "LGPL-2+", "LGPL-2.0+", + // "LGPLv3", "LGPL-v3", "LGPL-v3.0", "LGPL-3", "LGPL-3.0", + // "LGPLv3+", "LGPL-v3+", "LGPL-v3.0+", "LGPL-3+", "LGPL-3.0+", + {"pattern": "LGPL[v-]?[.0-9]*([+]|-only)?([- ]with[- ]exceptions)?"}, + + // Uncertain whether this is LGPL 2 or 3, but in any case we think weak copyleft should be OK + // for pipeline or server use as long as we're not distributing sources. 
+ "LGPL", + "GNU Library or Lesser General Public License (LGPL)", + + // GPL + // * library exception operates like LGPL + // * classpath exception is a linking exception related to Oracle + // Refs: + // * https://www.gnu.org/licenses/old-licenses/gpl-1.0.en.html + // * https://spdx.org/licenses/GPL-2.0-with-GCC-exception.html + // * https://spdx.org/licenses/GPL-3.0-with-GCC-exception.html + { + "pattern": [ + "(GNU General Public License|GPL)[ ]?[v-]?[123]([.]0)?([+]|[- ]only)?", + "([- ]with[- ]GCC(([- ]runtime)?[- ]library)?[- ]exception([- ][.0-9]*)?)?", + "([- ]with[- ]Classpath[- ]exception([- ][.0-9]+)?)?" + ] + }, + + // Linking = "GPLv3 compatible only", Private Use = Yes + // Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses + "GPL-2-or-3", // we sometimes generate this token + // "GPLv2+", "GPL-v2+", "GPL-v2.0+", "GPL-2+", "GPL-2.0+", + // "GPLv3", "GPL-v3", "GPL-v3.0", "GPL-3", "GPL-3.0", + // "GPLv3+", "GPL-v3+", "GPL-v3.0+", "GPL-3+", "GPL-3.0+", + // "GPLv3-only", "GPL-3-only", "GPL-v3-only", "GPL-3.0-only", "GPL-v3.0-only", + + // Uncertain whether this is GPL 2 or 3, but we'll assume that means we can use either. + // And version 3 is our preferred interpretation. 
+ "GNU General Public License", + "GPL", + + // This is an arbitrary catch-all name we made up because the R language some things identify themselves + // as a specific part of the R language + // Ref: https://cran.r-project.org/doc/FAQ/R-FAQ.html#Legalese + // An important clarification to this is here: + // Ref: https://cran.r-project.org/doc/FAQ/R-FAQ.html#Can-I-use-R-for-commercial-purposes_003f + "R-language-license" + ] +} diff --git a/dcicutils/license_policies/park-lab-pipeline.jsonc b/dcicutils/license_policies/park-lab-pipeline.jsonc new file mode 100644 index 000000000..5fbcc6616 --- /dev/null +++ b/dcicutils/license_policies/park-lab-pipeline.jsonc @@ -0,0 +1,12 @@ +{ + "class_key": "park-lab-pipeline", + "class_name": "ParkLabPipelineLicenseChecker", + "inherits_from": ["park-lab-common"], + "description": "Minimal/generic checker for non-GPL-approved pipelines from Park Lab.", + + "LICENSE_FRAMEWORKS": ["python", "conda", "r"] +} + + + + diff --git a/dcicutils/license_utils.py b/dcicutils/license_utils.py index db18fd7df..ab9aa9d76 100644 --- a/dcicutils/license_utils.py +++ b/dcicutils/license_utils.py @@ -1,8 +1,9 @@ import contextlib +import csv import datetime +import glob import io import json -# import logging import os import re import subprocess @@ -23,13 +24,20 @@ # import piplicenses from collections import defaultdict -from typing import Any, Dict, DefaultDict, List, Optional, Type, Union +from jsonc_parser.parser import JsoncParser +from typing import Any, Dict, DefaultDict, List, Optional, Type, TypeVar, Union # For obscure reasons related to how this file is used for early prototyping, these must use absolute references # to modules, not relative references. Later when things are better installed, we can make refs relative again. 
-from dcicutils.lang_utils import there_are -from dcicutils.misc_utils import PRINT, get_error_message, local_attrs +from dcicutils.common import Regexp, AnyJsonData +from dcicutils.exceptions import InvalidParameterError +from dcicutils.lang_utils import there_are, conjoined_list +from dcicutils.misc_utils import ( + PRINT, get_error_message, ignorable, ignored, json_file_contents, local_attrs, environ_bool, + remove_suffix, to_camel_case +) +T = TypeVar("T") # logging.basicConfig() # logger = logging.getLogger(__name__) @@ -42,6 +50,18 @@ _NAME = 'name' _STATUS = 'status' +_INHERITS_FROM = 'inherits_from' +_ALLOWED = 'allowed' +_EXCEPT = 'except' + + +def pattern(x): + return re.compile(x, re.IGNORECASE) + + +def augment(d: dict, by: dict): + return dict(d, **by) + class LicenseStatus: ALLOWED = "ALLOWED" @@ -51,6 +71,24 @@ class LicenseStatus: UNEXPECTED_MISSING = "UNEXPECTED_MISSING" +class LicenseOptions: + # General verbosity, such as progress information + VERBOSE = environ_bool("LICENSE_UTILS_VERBOSE", default=True) + # Specific additional debugging output + DEBUG = environ_bool("LICENSE_UTILS_DEBUG", default=False) + CONDA_PREFIX = os.environ.get("CONDA_LICENSE_CHECKER_PREFIX", os.environ.get("CONDA_PREFIX", "")) + POLICY_DIR = os.environ.get("LICENSE_UTILS_POLICY_DIR") + + @classmethod + @contextlib.contextmanager + def selected_options(cls, verbose=VERBOSE, debug=DEBUG, conda_prefix=CONDA_PREFIX, policy_dir=POLICY_DIR): + """ + Allows a script, for example, to specify overrides for these options dynamically. + """ + with local_attrs(cls, VERBOSE=verbose, DEBUG=debug, CONDA_PREFIX=conda_prefix, POLICY_DIR=policy_dir): + yield + + class LicenseFramework: NAME = None @@ -87,13 +125,13 @@ def temporary_registration_for_testing(cls): yield @classmethod - def register(cls, *, name): + def register_framework(cls, *, name): """ Declares a python license framework classs. 
Mostly these names will be language names like 'python' or 'javascript', but they might be names of other, non-linguistic frameworks (like 'cgap-pipeline', for example). """ - def _decorator(framework_class): + def _decorator(framework_class: T) -> T: if not issubclass(framework_class, LicenseFramework): raise ValueError(f"The class {framework_class.__name__} does not inherit from LicenseFramework.") framework_class.NAME = name @@ -116,26 +154,113 @@ def find_framework(cls, framework_spec: FrameworkSpec): def all_frameworks(cls): return sorted(cls.LICENSE_FRAMEWORKS.values(), key=lambda x: x.NAME) - -@LicenseFrameworkRegistry.register(name='javascript') + @classmethod + def all_framework_names(cls): + return sorted(cls.LICENSE_FRAMEWORKS.keys()) + + +# This is intended to match ' (= 3)', ' (>= 3)', ' (version 3)', ' (version 3 or greater)' +# It will incidentally and harmlessly also take ' (>version 3)' or '(>= 3 or greater)'. +# It will also correctly handle the unlikely case of ' (= 3 or greater)' + +_OR_LATER_PATTERN = '(?:[- ]or[ -](?:greater|later))' +_PARENTHETICAL_VERSION_CONSTRAINT = re.compile(f'( [(]([>]?)(?:[=]|version) ([0-9.]+)({_OR_LATER_PATTERN}?)[)])') +_POSTFIX_OR_LATER_PATTERN = re.compile(f"({_OR_LATER_PATTERN})") +_GPL_VERSION_CHOICE = re.compile('^GPL-v?([0-9.+]) (?:OR|[|]) GPL-v?([0-9.+])$') + + +def simplify_license_versions(licenses_spec: str, *, for_package_name) -> str: + m = _GPL_VERSION_CHOICE.match(licenses_spec) + if m: + version_a, version_b = m.groups() + return f"GPL-{version_a}-or-{version_b}" + # We only care which licenses were mentioned, not what algebra is used on them. + # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) + # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". + # If all of those licenses match, all is good. 
That _does_ mean some things like (MIT OR GPL-3.0) will + # have trouble passing unless both MIT and GPL-3.0 are allowed. + transform_count = 0 + original_licenses_spec = licenses_spec + ignorable(original_licenses_spec) # sometimes useful for debugging + while True: + if transform_count > 100: # It'd be surprising if there were even ten of these to convert. + warnings.warn(f"Transforming {for_package_name} {licenses_spec!r} seemed to be looping." + f" Please report this as a bug.") + return licenses_spec # return the unmodified + transform_count += 1 + m = _PARENTHETICAL_VERSION_CONSTRAINT.search(licenses_spec) + if not m: + break + matched, greater, version_spec, greater2 = m.groups() + is_greater = bool(greater or greater2) + licenses_spec = licenses_spec.replace(matched, + f"-{version_spec}" + f"{'+' if is_greater else ''}") + transform_count = 0 + while True: + if transform_count > 100: # It'd be surprising if there were even ten of these to convert. + warnings.warn(f"Transforming {for_package_name} {licenses_spec!r} seemed to be looping." + f" Please report this as a bug.") + return licenses_spec # return the unmodified + transform_count += 1 + m = _POSTFIX_OR_LATER_PATTERN.search(licenses_spec) + if not m: + break + matched = m.group(1) + licenses_spec = licenses_spec.replace(matched, '+') + if LicenseOptions.DEBUG and licenses_spec != original_licenses_spec: + PRINT(f"Rewriting {original_licenses_spec!r} as {licenses_spec!r}.") + return licenses_spec + + +def extract_boolean_terms(boolean_expression: str, for_package_name: str) -> List[str]: + # We only care which licenses were mentioned, not what algebra is used on them. + # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) + # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". + # If all of those licenses match, all is good. 
That _does_ mean some things like (MIT OR GPL-3.0) will + # have trouble passing unless both MIT and GPL-3.0 are allowed. + revised_boolean_expression = ( + boolean_expression + .replace('(', '') + .replace(')', '') + .replace(' AND ', ',') + .replace(' and ', ',') + .replace(' & ', ',') + .replace(' OR ', ',') + .replace(' or ', ',') + .replace('|', ',') + .replace(';', ',') + .replace(' + ', ',') + .replace('file ', f'Custom: {for_package_name} file ') + ) + terms = [x for x in sorted(map(lambda x: x.strip(), revised_boolean_expression.split(','))) if x] + if LicenseOptions.DEBUG and revised_boolean_expression != boolean_expression: + PRINT(f"Rewriting {boolean_expression!r} as {terms!r}.") + return terms + + +@LicenseFrameworkRegistry.register_framework(name='javascript') class JavascriptLicenseFramework(LicenseFramework): @classmethod - def implicated_licenses(cls, *, licenses_spec: str): - # We only care which licenses were mentioned, not what algebra is used on them. - # (Thankfully there are no NOTs, and that's probably not by accident, since that would be too big a set.) - # So for us, either (FOO AND BAR) or (FOO OR BAR) is the same because we want to treat it as "FOO,BAR". - # If all of those licenses match, all is good. That _does_ mean some things like (MIT OR GPL-3.0) will - # have trouble passing unless both MIT and GPL-3.0 are allowed. 
- licenses = sorted(map(lambda x: x.strip(), - (licenses_spec - .replace('(', '') - .replace(')', '') - .replace(' AND ', ',') - .replace(' OR ', ',') - ).split(','))) + def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: + ignored(package_name) + licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name) + licenses = extract_boolean_terms(licenses_spec, for_package_name=package_name) return licenses + VERSION_PATTERN = re.compile('^.+?([@][0-9.][^@]*|)$') + + @classmethod + def strip_version(cls, raw_name): + name = raw_name + m = cls.VERSION_PATTERN.match(raw_name) # e.g., @foo/bar@3.7 + if m: + suffix = m.group(1) + if suffix: + name = remove_suffix(m.group(1), name) + return name + @classmethod def get_dependencies(cls): output = subprocess.check_output(['npx', 'license-checker', '--summary', '--json'], @@ -147,24 +272,20 @@ def get_dependencies(cls): # e.g., this happens if there's no javascript in the repo raise Exception("No javascript license data was found.") result = [] - for name, record in records.items(): - licenses_spec = record.get(_LICENSES) - if '(' in licenses_spec: - licenses = cls.implicated_licenses(licenses_spec=licenses_spec) - PRINT(f"Rewriting {licenses_spec!r} as {licenses!r}") - elif licenses_spec: - licenses = [licenses_spec] - else: - licenses = [] + for raw_name, record in records.items(): + name = cls.strip_version(raw_name) + raw_licenses_spec = record.get(_LICENSES) + licenses = cls.implicated_licenses(licenses_spec=raw_licenses_spec, package_name=name) entry = { - _NAME: name.lstrip('@').split('@')[0], # e.g., @foo/bar@3.7 - _LICENSES: licenses # TODO: could parse this better. 
+ _NAME: name, + _LICENSES: licenses, + _FRAMEWORK: 'javascript' } result.append(entry) return result -@LicenseFrameworkRegistry.register(name='python') +@LicenseFrameworkRegistry.register_framework(name='python') class PythonLicenseFramework(LicenseFramework): @classmethod @@ -184,15 +305,102 @@ def get_dependencies(cls): entry = { _NAME: license_name, _LICENSES: licenses, - _LANGUAGE: 'python', + _FRAMEWORK: 'python', } result.append(entry) return sorted(result, key=lambda x: x.get(_NAME).lower()) -class LicenseFileParser: +@LicenseFrameworkRegistry.register_framework(name='conda') +class CondaLicenseFramework(LicenseFramework): - VERBOSE = False + @classmethod + def get_dependencies(cls): + prefix = LicenseOptions.CONDA_PREFIX + result = [] + filespec = os.path.join(prefix, "conda-meta/*.json") + files = glob.glob(filespec) + for file in files: + data = json_file_contents(file) + package_name = data['name'] + package_license = data.get('license') or "MISSING" + if package_license: + simplified_package_license_spec = simplify_license_versions(package_license, + for_package_name=package_name) + package_licenses = extract_boolean_terms(simplified_package_license_spec, + for_package_name=package_name) + else: + package_licenses = [] + entry = { + _NAME: package_name, + _LICENSES: package_licenses, + _FRAMEWORK: 'conda', + } + result.append(entry) + result.sort(key=lambda x: x['name']) + return result + + +@LicenseFrameworkRegistry.register_framework(name='r') +class RLicenseFramework(LicenseFramework): + + R_PART_SPEC = re.compile("^Part of R [0-9.]+$") + R_LANGUAGE_LICENSE_NAME = 'R-language-license' + + @classmethod + def implicated_licenses(cls, *, package_name, licenses_spec: str) -> List[str]: + if cls.R_PART_SPEC.match(licenses_spec): + return [cls.R_LANGUAGE_LICENSE_NAME] + licenses_spec = simplify_license_versions(licenses_spec, for_package_name=package_name) + licenses = extract_boolean_terms(licenses_spec, for_package_name=package_name) + return licenses + 
+ @classmethod + def get_dependencies(cls): + # NOTE: Although the R Language itself is released under the GPL, our belief is that it is + # still possible to write programs in R that are not GPL, even programs that use commercial licenses. + # So we do ordinary license checking here, same as in other frameworks. + # For notes on this, see the R FAQ. + # Ref: https://cran.r-project.org/doc/FAQ/R-FAQ.html#Can-I-use-R-for-commercial-purposes_003f + + _PACKAGE = "Package" + _LICENSE = "License" + + found_problems = 0 + + output_bytes = subprocess.check_output(['r', '--no-echo', '-q', '-e', + f'write.csv(installed.packages()[,c("Package", "License")])'], + # This will output to stderr if there's an error, + # but it will still put {} on stdout, which is good enough for us. + stderr=subprocess.DEVNULL) + output = output_bytes.decode('utf-8') + result = [] + first_line = True + for entry in csv.reader(io.StringIO(output)): # [ignore, package, license] + if first_line: + first_line = False + if entry == ["", _PACKAGE, _LICENSE]: # we expect headers + continue + try: + package_name = entry[1] + licenses_spec = entry[2] + licenses = cls.implicated_licenses(package_name=package_name, licenses_spec=licenses_spec) + entry = { + _NAME: package_name, + _LICENSES: licenses, + _FRAMEWORK: 'r', + } + result.append(entry) + except Exception as e: + found_problems += 1 + if LicenseOptions.VERBOSE: + PRINT(get_error_message(e)) + if found_problems > 0: + warnings.warn(there_are(found_problems, kind="problem", show=False, punctuate=True, tense='past')) + return sorted(result, key=lambda x: x.get(_NAME).lower()) + + +class LicenseFileParser: SEPARATORS = '-.,' SEPARATORS_AND_WHITESPACE = SEPARATORS + ' \t' @@ -230,8 +438,6 @@ def parse_simple_license_file(cls, *, filename): lines = [] for i, line in enumerate(fp): line = line.strip(' \t\n\r') - if cls.VERBOSE: # pragma: no cover - this is just for debugging - PRINT(str(i).rjust(3), line) m = cls.COPYRIGHT_LINE.match(line) if 
line[:1].isupper() else None if not m: lines.append(line) @@ -291,22 +497,31 @@ def report(message): class LicenseChecker: """ - There are three important class variables to specify: + License checkers are defined as .jsonc files. The JSONC file format is JSON with Comments. + (The comments are Javascript syntax, either '//' or '/* ... */'.) + + There are these important class variables to specify: LICENSE_TITLE is a string naming the license to be expected in LICENSE.txt COPYRIGHT_OWNER is the name of the copyright owner. - FRAMEWORKS will default to all defined frameworks (presently ['python', 'javascript'], but can be limited to - just ['python'] for example. It doesn't make a lot of sense to limit it to ['javascript'], though you could, - since you are using a Python library to do this, and it probably needs to have its dependencies checked. + LICENSE_FRAMEWORKS will default to all defined frameworks (presently ['conda', 'javascript', 'python', 'r']), + but can be limited to just ['python'] for example. It doesn't make a lot of sense to limit it to + ['javascript'], though you could, since you are using a Python library to do this, and it probably + needs to have its dependencies checked. - ALLOWED is a list of license names as returned by the pip-licenses library. + ALLOWED is a list of license names as returned by the various license frameworks. Because they rely on different + underlying tools the exact format of the names that result might vary. For this reason, there is a regular + expression capability for this particular attribute. In addition to just a string, you can also use + {"pattern": "<regexp>"}. For very long regular expressions, {"pattern": ["<regexp-part-1>", ...]} will + concatenate all the parts into a single regexp, so they can be gracefully broken over lines in the .jsonc + source file. If regexp flags are required, use {"pattern": "<regexp>", "flags": ["flag1", ...]}. - EXPECTED_MISSING is a list of libraries that are expected to have no license information.
This is so you don't - have to get warning fatigue by seeing a warning over and over for things you know about. If a new library - with no license info shows up that you don't expect, you should investigate it, make sure it's OK, - and then add it to this list. + EXPECTED_MISSING_LICENSES is a list of libraries that are expected to have no license information. + This is so you don't have to get warning fatigue by seeing a warning over and over for things you know about. + If a new library with no license info shows up that you don't expect, you should investigate it, + make sure it's OK, and then add it to this list. EXCEPTIONS is a table (a dict) keyed on license names with entries that are lists of library names that are allowed to use the indicated license even though the license might not be generally allowed. This should be @@ -316,14 +531,12 @@ class LicenseChecker: Note that if you don't like these license names, which are admittedly non-standard and do nt seem to use SPDX naming conventions, you can customize the get_dependencies method to return a different list, one of the form - [{"name": "libname", "license_classifier": ["license1", "license2", ...], "language": "python"}] + [{"name": "libname", "license_classifier": ["license1", "license2", ...], "framework": "python"}] by whatever means you like and using whatever names you like. """ # Set this to True in subclasses if you want your organization's policy to be that you see # some visible proof of which licenses were checked. 
- VERBOSE = True - LICENSE_TITLE = None COPYRIGHT_OWNER = None LICENSE_FRAMEWORKS = None @@ -378,6 +591,22 @@ def analyze_license_file(cls, *, analysis: LicenseAnalysis, check_license_title=license_title or cls.LICENSE_TITLE, analysis=analysis) + CHOICE_REGEXPS = {} + + @classmethod + def _make_regexp_for_choices(cls, choices): + inner_pattern = '|'.join('^' + (re.escape(choice) if isinstance(choice, str) else choice.pattern) + '$' + for choice in choices) or "^$" + return re.compile(f"({inner_pattern})", re.IGNORECASE) + + @classmethod + def _find_regexp_for_choices(cls, choices): + key = str(choices) + regexp = cls.CHOICE_REGEXPS.get(key) + if not regexp: + cls.CHOICE_REGEXPS[key] = regexp = cls._make_regexp_for_choices(choices) + return regexp + @classmethod def analyze_license_dependencies_for_framework(cls, *, analysis: LicenseAnalysis, @@ -385,7 +614,7 @@ def analyze_license_dependencies_for_framework(cls, *, acceptable: Optional[List[str]] = None, exceptions: Optional[Dict[str, str]] = None, ) -> None: - acceptable = (acceptable or []) + (cls.ALLOWED or []) + acceptability_regexp = cls._find_regexp_for_choices((acceptable or []) + (cls.ALLOWED or [])) exceptions = dict(cls.EXCEPTIONS or {}, **(exceptions or {})) try: @@ -415,7 +644,7 @@ def analyze_license_dependencies_for_framework(cls, *, by_special_exception = False for license_name in license_names: special_exceptions = exceptions.get(license_name, []) - if license_name in acceptable: + if acceptability_regexp.match(license_name): # license_name in acceptable: pass elif name in special_exceptions: by_special_exception = True @@ -430,7 +659,7 @@ def analyze_license_dependencies_for_framework(cls, *, _LICENSES: license_names, _STATUS: status }) - if cls.VERBOSE: # pragma: no cover - this is just for debugging + if LicenseOptions.VERBOSE: # pragma: no cover - this is just for debugging PRINT(f"Checked {framework.NAME} {name}:" f" {'; '.join(license_names) if license_names else '---'} ({status})") @@ -458,8 
+687,9 @@ def analyze_license_dependencies_by_framework(cls, *, @classmethod def show_unacceptable_licenses(cls, *, analysis: LicenseAnalysis) -> LicenseAnalysis: if analysis.unacceptable: + # This is part of the essential output, so is not conditional on switches. PRINT(there_are(analysis.unacceptable, kind="unacceptable license", show=False, punctuation_mark=':')) - for license, names in analysis.unacceptable.items(): + for license, names in sorted(analysis.unacceptable.items()): PRINT(f" {license}: {', '.join(names)}") return analysis @@ -499,6 +729,42 @@ class MyOrgLicenseChecker(LicenseChecker): raise LicenseAcceptabilityCheckFailure(unacceptable_licenses=analysis.unacceptable) +class LicenseCheckerRegistry: + + REGISTRY: Dict[str, Type[LicenseChecker]] = {} + + @classmethod + def register_checker(cls, name: str): + def _register(license_checker_class: Type[LicenseChecker]): + cls.REGISTRY[name] = license_checker_class + return license_checker_class + return _register + + @classmethod + def find_checker(cls, checker_name: str) -> Optional[Type[LicenseChecker]]: + return cls.REGISTRY.get(checker_name, None) + + @classmethod + def lookup_checker(cls, checker_name: str, autoload: bool = True, + policy_dir: Optional[str] = None) -> Type[LicenseChecker]: + result: Optional[Type[LicenseChecker]] = cls.find_checker(checker_name) + if result is None: + if autoload: + policy_dir = policy_dir or LicenseOptions.POLICY_DIR or POLICY_DIR + PRINT(f"Looking for custom policy {checker_name} in {policy_dir} ...") + result = find_or_create_license_class(policy_name=checker_name, + policy_dir=policy_dir) + if result: + return result + raise InvalidParameterError(parameter='checker_name', value=checker_name, + options=cls.all_checker_names()) + return result + + @classmethod + def all_checker_names(cls): + return list(cls.REGISTRY.keys()) + + class LicenseCheckFailure(Exception): DEFAULT_MESSAGE = "License check failure." 
@@ -523,375 +789,243 @@ def __init__(self, message=None, unacceptable_licenses=None): super().__init__(message=message) -class C4InfrastructureLicenseChecker(LicenseChecker): +def literal_string_or_regexp_from_dict(item): + """ + Expects either a string (which will be matched using ordinary equality) or a regular expression, + expressed as a dictionary of the form {"pattern": <regexp>, "flags": [<flag>, ...]}. + The pattern is required. The flags may be omitted if null. + A pattern is either a string or a list of strings. If it is a list of strings, it will be concatenated + into a single string, which can be useful for breaking long strings over lines. + Flags are string names of re.WHATEVER flags that would be given to Python's re.compile. + UNICODE and IGNORECASE are on by default. + """ + if isinstance(item, str): + return item + elif not isinstance(item, dict): + raise ValueError(f'Expected a string or a dictionary describing a regular expression.') + pattern = item.get('pattern') + # The pattern is permitted to be a string or list of strings, since in a JSON-style file we can't + # do the thing we do in python where we just juxtapose several strings, separated by whitespace + # and/or newlines, in order to have them taken as a single literal string.
-kmp 29-Sep-2023 + if isinstance(pattern, str): + pass + elif isinstance(pattern, list): + pattern = ''.join(pattern) + else: + raise ValueError(f"Invalid pattern expression: {item!r}") + flags = item.get('flags') or [] + compilation_flags = re.IGNORECASE # UNICODE will default, but IGNORECASE we have to set up manually + for flag in flags: + if isinstance(flag, str) and flag.isupper(): + if hasattr(re, flag): + compilation_flags |= getattr(re, flag) + else: + raise ValueError(f"No such flag re.{flag}") + else: + raise ValueError(f"Flags must be strigs: {flag!r}") + regexp = re.compile(pattern, compilation_flags) + return regexp + + +def read_license_policy_file(file): """ - This set of values is useful to us in Park Lab where these tools were developed. - If you're at some other organization, we recommend you make a class that has values - suitable to your own organizational needs. + Reads a license policy file, which is a JSONC file (can contain JSON with Javascript-style comments). + The policy is a dictionary, but the ALLOWED option is a list that can contain special syntax allowing + a regular expression to be inferred. See documentation of `literal_string_or_regexp_from_dict` for details. """ + data = JsoncParser.parse_file(file) + allowed = data.get('ALLOWED') + if isinstance(allowed, list): + # The "ALLOWED" option is specially permitted to contain regular expressions.
+ data['ALLOWED'] = [literal_string_or_regexp_from_dict(allowance) for allowance in allowed] + return data + + +_MY_DIR = os.path.dirname(__file__) + +POLICY_DIR = os.path.join(_MY_DIR, "license_policies") + +POLICY_DATA_CACHE = {} + + +def built_in_policy_names(): + return [ + os.path.splitext(os.path.basename(license_policy_path))[0] + for license_policy_path in glob.glob(os.path.join(POLICY_DIR, "*.jsonc"))] + + +def find_policy_data(policy_name: str, policy_dir: Optional[str] = None, + use_cache: bool = True, error_if_missing: bool = True): + policy_dir = POLICY_DIR if policy_dir is None else policy_dir + existing_data = POLICY_DATA_CACHE.get(policy_name) if use_cache else None + if existing_data: + return existing_data + else: + filename = os.path.join(policy_dir, policy_name + ".jsonc") + if not os.path.exists(filename): + if error_if_missing: + raise ValueError(f"No such policy: {policy_name!r}") + else: + return None + data = read_license_policy_file(filename) + POLICY_DATA_CACHE[policy_name] = data + return data + + +def find_or_create_license_class(*, policy_name: str, policy_dir: str, + # This next argument should never be passed explicitly by callers other than + # recursive calls to this function. -kmp 28-Sep-2023 + _creation_attmpts_in_progress=None): + """ + Define a policy class given a policy name (like 'c4-infrastructure'). 
+ """ + _creation_attmpts_in_progress = _creation_attmpts_in_progress or [] + existing_checker = LicenseCheckerRegistry.find_checker(checker_name=policy_name) + if existing_checker: + return existing_checker + elif policy_name in _creation_attmpts_in_progress: + raise ValueError(f"Circular reference to {policy_name} detected" + f" while creating {conjoined_list(_creation_attmpts_in_progress)}.") + _creation_attmpts_in_progress.append(policy_name) + license_checker_class_name = to_camel_case(policy_name) + "LicenseChecker" + policy_data = find_policy_data(policy_name, policy_dir=policy_dir) + inherits_from = policy_data.get('inherits_from') + if not isinstance(inherits_from, list): + raise ValueError(f'Policy {policy_name!r} needs "inherits_from": [...parent names...],' + f' which may be empty but must be specified.') + license_frameworks = policy_data.get('LICENSE_FRAMEWORKS') + if license_frameworks == "ALL": + policy_data['LICENSE_FRAMEWORKS'] = LicenseFrameworkRegistry.all_framework_names() + parent_classes = [find_or_create_license_class(policy_name=parent_name, policy_dir=policy_dir, + _creation_attmpts_in_progress=_creation_attmpts_in_progress) + for parent_name in inherits_from] + defaulted_policy_data = default_policy_data(policy_name=policy_name, policy_data=policy_data, + parent_classes=parent_classes) + new_class = type(license_checker_class_name, + (*parent_classes, LicenseChecker), + {'_policy_data': policy_data, **defaulted_policy_data}) + new_class.__doc__ = policy_data.get("description") or f'License policy {policy_name} needs a "description".' + # Sigh. PyCharm can't figure this out type fact out, even with a type hint on the above assignment to new_class, + # such as 'new_class: Type[LicenseChecker] = ...'. That should have worked. Putting in an assert was the only way + # I could find to convince PyCharm of the truth. I don't expect this assertion to ever fail. It's just an artifact + # to prevent ugly browser highlighting. 
I'll try to arrange a bug report for them. -kmp 29-Sep-2023 + assert isinstance(new_class, type) and issubclass(new_class, LicenseChecker) + license_policy_class: Type[LicenseChecker] = new_class + decorator = LicenseCheckerRegistry.register_checker(name=policy_name) + registered_class = decorator(license_policy_class) + if LicenseOptions.DEBUG: # pragma: no cover - this doesn't have to work for production + found_class = LicenseCheckerRegistry.lookup_checker(policy_name) + PRINT(f"Registered checker class {policy_name!r}" + f" with license_frameworks {conjoined_list(found_class.LICENSE_FRAMEWORKS)}.") + _creation_attmpts_in_progress.remove(policy_name) + return registered_class + + +def use_policy_literal(*, policy_name, policy_datum, other_policy_data): + """This is used for a datum that requires no merging. The policy_datum is returned. Other arguments are ignored.""" + ignored(policy_name, other_policy_data) + return policy_datum + + +def str_or_regexp_sort_key(datum: Union[str, Regexp]): + """ + Returns a key for a datum that is an element of a list of elements that are strings or compiled regular expressions. + Regular expressions will sort where their pattern would be in the series of strings. + """ + # Rationale: We want something like this just to make testing predictable.
+ if isinstance(datum, str): + return datum + else: + return datum.pattern + - COPYRIGHT_OWNER = "President and Fellows of Harvard College" - LICENSE_TITLE = "(The )?MIT License" - LICENSE_FRAMEWORKS = ['python', 'javascript'] - - ALLOWED = [ - - # <> - # Ref: https://opensource.org/license/0bsd/ - '0BSD', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Academic Free License (AFL)', - 'AFL-2.1', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Apache Software License', - 'Apache-Style', - 'Apache-2.0', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'BSD License', - 'BSD-2-Clause', - 'BSD-3-Clause', - - # Linking = Public Domain, Private Use = Public Domain - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'CC0', - 'CC0-1.0', - - # Linking = Permissive, Private Use = Permissive - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'CC-BY', - 'CC-BY-3.0', - 'CC-BY-4.0', - - # Linking = Permissive, Private Use = ? - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'CDDL', - - # The original Eclipse Distribution License 1.0 is essentially a BSD-3-Clause license. 
- # Ref: https://www.eclipse.org/org/documents/edl-v10.php - 'Eclipse Distribution License', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Eclipse Public License', - 'EPL-2.0', - - # Linking = Yes, Cat = Permissive Software Licenses - # Ref: https://en.wikipedia.org/wiki/Historical_Permission_Notice_and_Disclaimer - 'Historical Permission Notice and Disclaimer (HPND)', - - # Linking = Permissive, Private Use = Permissive - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'ISC License (ISCL)', - 'ISC', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'MIT License', - 'MIT', - - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Mozilla Public License 2.0 (MPL 2.0)', - 'MPL-1.1', - 'MPL-2.0', - - # The SIL Open Font License appears to be a copyleft-style license that applies narrowly - # to icons and not to the entire codebase. It is advertised as OK for use even in commercial - # applications. - # Ref: https://fontawesome.com/license/free - 'OFL-1.1', - - # Ref: https://en.wikipedia.org/wiki/Public_domain - 'Public Domain', - - # Linking = Permissive, Private Use = Permissive - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Python Software Foundation License', - 'Python-2.0', - - # License = BSD-like - # Ref: https://en.wikipedia.org/wiki/Pylons_project - 'Repoze Public License', - - # Linking = Permissive/Public domain, Private Use = Permissive/Public domain - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'The Unlicense (Unlicense)', - 'Unlicense', - - # Linking = Permissive, Private Use = ? 
- # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'W3C License', - 'W3C-20150513', - - # Linking = Permissive/Public Domain, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'WTFPL', - - # Copyleft = No - # Ref: https://en.wikipedia.org/wiki/Zlib_License - # Linking = Permissive, Private Use = ? (for zlib/libpng license) - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'Zlib', - - # Copyleft = No, FSF/OSI-approved: Yes - # Ref: https://en.wikipedia.org/wiki/Zope_Public_License - 'Zope Public License', - ] - - EXPECTED_MISSING_LICENSES = [ - - # This is a name we use for our C4 portals. And it isn't published. - # We inherited the name from the Stanford ENCODE group, which had an MIT-licensed repo we forked - 'encoded', # cgap-portal, fourfront, and smaht-portal all call themselves this - - # We believe that since these next here are part of the Pylons project, they're covered under - # the same license as the other Pylons projects. We're seeking clarification. - 'pyramid-translogger', - 'subprocess-middleware', - - # This appears to be a BSD 2-Clause "Simplified" License, according to GitHub. - # PyPi also says it's a BSD license. - # Ref: https://github.com/paulc/dnslib/blob/master/LICENSE - 'dnslib', - - # This says it wants an ISC License, which we already have approval for but just isn't showing up. - # Ref: https://github.com/rthalley/dnspython/blob/master/LICENSE - 'dnspython', - - # This appears to be a mostly-MIT-style license. - # There are references to parts being in the public domain, though it's not obvious if that's meaningful. - # It's probably sufficient for our purposes to treat this as a permissive license. 
- # Ref: https://github.com/tlsfuzzer/python-ecdsa/blob/master/LICENSE - 'ecdsa', - - # This has an MIT license in its source repository - # Ref: https://github.com/xlwings/jsondiff/blob/master/LICENSE - 'jsondiff', - - # This has an MIT license in its source repository - # Ref: https://github.com/pkerpedjiev/negspy/blob/master/LICENSE - 'negspy', - - # This license statement is complicated, but seems adequately permissive. - # Ref: https://foss.heptapod.net/python-libs/passlib/-/blob/branch/stable/LICENSE - 'passlib', - - # This seems to be a BSD-3-Clause license. - # Ref: https://github.com/protocolbuffers/protobuf/blob/main/LICENSE - # pypi agrees in the Meta section of protobuf's page, where it says "3-Clause BSD License" - # Ref: https://pypi.org/project/protobuf/ - 'protobuf', - - # The WTFPL license is permissive. - # Ref: https://github.com/mk-fg/pretty-yaml/blob/master/COPYING - 'pyaml', - - # This uses a BSD license - # Ref: https://github.com/eliben/pycparser/blob/master/LICENSE - 'pycparser', - - # The source repo for pyDes says this is under an MIT license - # Ref: https://github.com/twhiteman/pyDes/blob/master/LICENSE.txt - # pypi, probably wrongly, thinks this is in the public domain (as of 2023-07-21) - # Ref: https://pypi.org/project/pyDes/ - 'pyDes', - - # This uses an MIT license - # Ref: https://github.com/pysam-developers/pysam/blob/master/COPYING - 'pysam', - - # The version of python-lambda that we forked calls itself this (and publishes at pypi under this name) - "python-lambda-4dn", - - # This is MIT-licensed: - # Ref: https://github.com/themiurgo/ratelim/blob/master/LICENSE - # pypi agrees - # Ref: https://pypi.org/project/ratelim/ - 'ratelim', - - # This is a BSD-3-Clause-Modification license - # Ref: https://github.com/repoze/repoze.debug/blob/master/LICENSE.txt - 'repoze.debug', - - # This is an Apache-2.0 license - # Ref: https://github.com/getsentry/responses/blob/master/LICENSE - 'responses', - - # This seems to get flagged 
sometimes, but is not the pypi snovault library, it's what our dcicsnovault - # calls itself internally.. In any case, it's under MIT license and OK. - # Ref: https://github.com/4dn-dcic/snovault/blob/master/LICENSE.txt - 'snovault', - - # PyPi identifies the supervisor library license as "BSD-derived (http://www.repoze.org/LICENSE.txt)" - # Ref: https://pypi.org/project/supervisor/ - # In fact, though, the license is a bit more complicated, though apparently still permissive. - # Ref: https://github.com/Supervisor/supervisor/blob/main/LICENSES.txt - 'supervisor', - - # This seems to be a BSD-3-Clause-Modification license. - # Ref: https://github.com/Pylons/translationstring/blob/master/LICENSE.txt - 'translationstring', - - # This seems to be a BSD-3-Clause-Modification license. - # Ref: https://github.com/Pylons/venusian/blob/master/LICENSE.txt - 'venusian', - - # PyPi identifies zope.deprecation as using the "Zope Public License (ZPL 2.1)" license. - # Ref: https://github.com/zopefoundation/Zope/blob/master/LICENSE.txt - 'zope.deprecation', - - # Below are licenses last known to have licenses missing in pip-licenses and need to be investigated further. - # Note well that just because pip-licenses doesn't know the license doesn't mean the software has - # no license. It may just mean the library is poorly registered in pypi. Some licenses have to be - # found by looking at the library's documentation or source files. - - # (all of these have been classified at this point) - - ] - - EXCEPTIONS = { - - 'BSD*': [ - # Although modified to insert the author name into the license text itself, - # the license for these libraries are essentially BSD-3-Clause. - 'formatio', - 'samsam', - - # There are some slightly different versions of what appear to be BSD licenses here, - # but clearly the license is permissive. 
- # Ref: https://www.npmjs.com/package/mutation-observer?activeTab=readme - 'mutation-observer', - ], - - 'Custom: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global': [ - # The use of this URL appears to be a syntax error in the definition of entries-ponyfill - # In fact this seems to be covered by a CC0-1.0 license. - # Ref: https://unpkg.com/browse/object.entries-ponyfill@1.0.1/LICENSE - 'object.entries-ponyfill', - ], - - 'Custom: https://github.com/saikocat/colorbrewer.': [ - # The use of this URL appears to be a syntax error in the definition of cartocolor - # In fact, this seems to be covered by a CC-BY-3.0 license. - # Ref: https://www.npmjs.com/package/cartocolor?activeTab=readme - 'cartocolor', - ], - - 'Custom: https://travis-ci.org/component/emitter.png': [ - # The use of this png appears to be a syntax error in the definition of emitter-component. - # In fact, emitter-component uses an MIT License - # Ref: https://www.npmjs.com/package/emitter-component - # Ref: https://github.com/component/emitter/blob/master/LICENSE - 'emitter-component', - ], - - # The 'turfs-jsts' repository (https://github.com/DenisCarriere/turf-jsts/blob/master/README.md) - # seems to lack a license, but appears to be forked from the jsts library that uses - # the Eclipse Public License 1.0 and Eclipse Distribution License 1.0, so probably a permissive - # license is intended. - 'Custom: https://travis-ci.org/DenisCarriere/turf-jsts.svg': [ - 'turf-jsts' - ], - - # DFSG = Debian Free Software Guidelines - # Ref: https://en.wikipedia.org/wiki/Debian_Free_Software_Guidelines - # Used as an apparent modifier to other licenses, to say they are approved per Debian. - # For example in this case, pytest-timeout has license: DFSG approved, MIT License, - # but is really just an MIT License that someone has checked is DFSG approved. 
- 'DFSG approved': [ - 'pytest-timeout', # MIT Licensed - ], - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v2 or later (LGPLv2+)': [ - 'chardet' # used at runtime during server operation (ingestion), but not modified or distributed - ], - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Lesser General Public License v3 or later (LGPLv3+)': [ - 'pytest-redis', # used only privately in testing, not used in server code, not modified, not distributed - 'mirakuru', # required by pytest-redis (used only where it's used) - ], - - 'GNU General Public License (GPL)': [ - 'docutils', # Used only privately as a separate documentation-generation task for ReadTheDocs - ], - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - # 'GNU Lesser General Public License v3 or later (LGPLv3+)', - - # Linking = With Restrictions, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'GNU Library or Lesser General Public License (LGPL)': [ - 'psycopg2', # Used at runtime during server operation, but not modified or distributed - 'psycopg2-binary', # Used at runtime during server operation, but not modified or distributed - 'chardet', # Potentially used downstream in loadxl to detect charset for text files - 'pyzmq', # Used in post-deploy-perf-tests, not distributed, and not modified or distributed - ], - - 'GPL-2.0': [ - # The license file for the node-forge javascript library says: - # - # "You may use the Forge project under the terms of either the BSD License or the - # GNU General Public License (GPL) Version 2." - # - # (We choose to use it under the BSD license.) 
- # Ref: https://www.npmjs.com/package/node-forge?activeTab=code - 'node-forge', - ], - - 'MIT*': [ - - # This library uses a mix of licenses, but they (MIT, CC0) generally seem permissive. - # (It also mentions that some tools for building/testing use other libraries.) - # Ref: https://github.com/requirejs/domReady/blob/master/LICENSE - 'domready', - - # This library is under 'COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1' - # Ref: https://github.com/javaee/jsonp/blob/master/LICENSE.txt - # About CDDL ... - # Linking = Permissive, Private Use = ? - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'jsonp', - - # This library says pretty clearly it intends MIT license. - # Ref: https://www.npmjs.com/package/component-indexof - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'component-indexof', - - # These look like a pretty straight MIT license. - # Linking = Permissive, Private Use = Yes - # Ref: https://en.wikipedia.org/wiki/Comparison_of_free_and_open-source_software_licenses - 'mixin', # LICENSE file at https://www.npmjs.com/package/mixin?activeTab=code - 'stack-trace', # https://github.com/stacktracejs/stacktrace.js/blob/master/LICENSE - 'typed-function', # LICENSE at https://www.npmjs.com/package/typed-function?activeTab=code - - ], - - 'UNLICENSED': [ - # The udn-browser library is our own and has been observed to sometimes show up in some contexts - # as UNLICENSED, when really it's MIT. - # Ref: https://github.com/dbmi-bgm/udn-browser/blob/main/LICENSE - 'udn-browser', - ], - - } - - -class C4PythonInfrastructureLicenseChecker(C4InfrastructureLicenseChecker): +def merge_policy_lists(*, policy_name, policy_datum, other_policy_data, sort_key=None): """ - For situations like dcicutils and dcicsnovault where there's no Javascript, this will test just Python. 
+ Merges a set of policy lists by appending them and de-duplicating. + By default, the result list is assumed to be homogeneous in type and suitable for sorting. + If the list is of heterogeneous type, a sort_key must be supplied to allow a total ordering. """ - LICENSE_FRAMEWORKS = ['python'] + ignored(policy_name) + result = policy_datum + for other_datum in other_policy_data: + result += other_datum + # de-duplicate and apply a deterministic ordering to make testing easier. + return sorted(set(result), key=sort_key) + + +def merge_policy_strings_or_regexps(*, policy_name, policy_datum, other_policy_data): + return merge_policy_lists(policy_name=policy_name, policy_datum=policy_datum, other_policy_data=other_policy_data, + sort_key=str_or_regexp_sort_key) + + +def merge_policy_dicts(*, policy_name, policy_datum, other_policy_data): + ignored(policy_name) + merged = defaultdict(lambda: []) + + def add_to_merged(d): + for k, values in d.items(): + for value in values: + merged[k].append(value) + + add_to_merged(policy_datum) + for other_datum in other_policy_data: + add_to_merged(other_datum) + + return {k: sorted(set(v)) for k, v in sorted(merged.items())} + + +POLICY_ATTRS: callable = { + 'class_key': use_policy_literal, + 'class_name': use_policy_literal, + 'inherits_from': use_policy_literal, + 'description': use_policy_literal, + 'LICENSE_TITLE': use_policy_literal, + 'COPYRIGHT_OWNER': use_policy_literal, + 'LICENSE_FRAMEWORKS': use_policy_literal, + 'ALLOWED': merge_policy_strings_or_regexps, + 'EXPECTED_MISSING_LICENSES': merge_policy_lists, + 'EXCEPTIONS': merge_policy_dicts, +} + +POLICY_MERGE_LISTS = {'ALLOWED', 'EXPECTED_MISSING_LICENSES'} +POLICY_MERGE_DICTS = {'EXCEPTIONS'} + + +def get_attrs_for_classes(attr: str, class_data: List[Type]): + result = [] + for class_datum in class_data: + attr_val = getattr(class_datum, attr, None) # Intentionally treats explicit None the same as missing + if attr_val is not None: + result.append(attr_val) + return
result + + +def default_policy_data(*, policy_name: str, policy_data: AnyJsonData, parent_classes: List[Type]): + result = {} + for key_to_default, val_to_be_defaulted in policy_data.items(): + attr_handler: Optional[callable] = POLICY_ATTRS.get(key_to_default) + if attr_handler is None: + raise ValueError(f"Bad policy attribute: {key_to_default}") + result[key_to_default] = attr_handler(policy_name=policy_name, policy_datum=val_to_be_defaulted, + other_policy_data=get_attrs_for_classes(key_to_default, parent_classes)) + return result + + +def load_license_policies(policy_dir=None): + for policy_name in built_in_policy_names(): + find_or_create_license_class(policy_name=policy_name, policy_dir=policy_dir) + + +# This will cause the definitions of classes in the predefined set to be exported by this library +# in case they need to be imported elsewhere, for example to use in unit-testing. Those are things like +# * ParkLabCommonLicenseChecker, etc. +# * C4InfrastructureLicenseChecker, etc. +# See license_policies/*.jsonc for a full list.
+load_license_policies() + +ParkLabCommonLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-common') +ParkLabCommonServerLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-common-server') +ParkLabPipelineLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-pipeline') +ParkLabGplPipelineLicenseChecker = LicenseCheckerRegistry.lookup_checker('park-lab-gpl-pipeline') +C4InfrastructureLicenseChecker = LicenseCheckerRegistry.lookup_checker('c4-infrastructure') +C4PythonInfrastructureLicenseChecker = LicenseCheckerRegistry.lookup_checker('c4-python-infrastructure') diff --git a/dcicutils/misc_utils.py b/dcicutils/misc_utils.py index 88c228c7f..66ef4a371 100644 --- a/dcicutils/misc_utils.py +++ b/dcicutils/misc_utils.py @@ -7,10 +7,11 @@ import functools import hashlib import inspect -import math import io -import os +import json import logging +import math +import os import pytz import re import rfc3986.validators @@ -20,8 +21,8 @@ import webtest # importing the library makes it easier to mock testing from collections import defaultdict -from dateutil.parser import parse as dateutil_parse from datetime import datetime as datetime_type +from dateutil.parser import parse as dateutil_parse from typing import Optional @@ -296,6 +297,9 @@ def app(self): return self.wrapped_app.app +VirtualAppResponse = webtest.response.TestResponse + + def exported(*variables): """ This function does nothing but is used for declaration purposes. @@ -1307,6 +1311,11 @@ def file_contents(filename, binary=False): return fp.read() +def json_file_contents(filename): + with io.open(filename, 'r') as fp: + return json.load(fp) + + def camel_case_to_snake_case(s, separator='_'): """ Converts CamelCase to snake_case. @@ -1335,7 +1344,11 @@ def to_camel_case(s): """ Converts a string that might be in snake_case or CamelCase into CamelCase. 
""" - if s[:1].isupper() and '_' not in s: + hyphen_found = False + if '-' in s: + hyphen_found = True + s = s.replace('-', '_') + if not hyphen_found and s[:1].isupper() and '_' not in s: return s else: return snake_case_to_camel_case(s) diff --git a/dcicutils/scripts/run_license_checker.py b/dcicutils/scripts/run_license_checker.py new file mode 100644 index 000000000..7cafdae37 --- /dev/null +++ b/dcicutils/scripts/run_license_checker.py @@ -0,0 +1,83 @@ +import argparse + +from dcicutils.command_utils import script_catch_errors, ScriptFailure +from dcicutils.lang_utils import there_are, conjoined_list +from dcicutils.license_utils import LicenseOptions, LicenseCheckerRegistry, LicenseChecker, LicenseCheckFailure +from dcicutils.misc_utils import PRINT, get_error_message +from typing import Optional, Type + + +EPILOG = __doc__ + + +ALL_CHECKER_NAMES = sorted(LicenseCheckerRegistry.all_checker_names(), + key=lambda x: 'aaaaa-' + x if x.startswith('park-lab-') else x) +NEWLINE = '\n' + + +def main(): + + parser = argparse.ArgumentParser( + description="Runs a license checker", + epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("name", type=str, default=None, nargs='?', + help=f"The name of a checker to run. 
" + + there_are(ALL_CHECKER_NAMES, kind='available checker', + show=True, joiner=conjoined_list, punctuate=True)) + parser.add_argument("--brief", '-b', default=False, action="store_true", + help="Requests brief output.") + parser.add_argument("--debug", '-q', default=False, action="store_true", + help="Requests additional debugging output.") + parser.add_argument("--conda-prefix", "--conda_prefix", "--cp", default=LicenseOptions.CONDA_PREFIX, + help=f"Overrides the CONDA_PREFIX (default {LicenseOptions.CONDA_PREFIX!r}).") + parser.add_argument("--policy-dir", "--policy_dir", "--pd", default=LicenseOptions.POLICY_DIR, + help=f"Specifies a custom policy directory (default {LicenseOptions.POLICY_DIR!r}).") + + args = parser.parse_args() + + with script_catch_errors(): + run_license_checker(name=args.name, verbose=not args.brief, debug=args.debug, conda_prefix=args.conda_prefix, + policy_dir=args.policy_dir) + + +def show_help_for_choosing_license_checker(): + PRINT("") + PRINT(there_are(ALL_CHECKER_NAMES, kind='available checker', show=False, punctuation_mark=':')) + PRINT("") + wid = max(len(x) for x in ALL_CHECKER_NAMES) + 1 + for checker_name in ALL_CHECKER_NAMES: + checker_class = LicenseCheckerRegistry.lookup_checker(checker_name) + checker_doc = (checker_class.__doc__ or '').strip(' \t\n\r') + PRINT(f"{(checker_name + ':').ljust(wid)} {checker_doc.split(NEWLINE)[0]}") + PRINT("") + PRINT("=" * 42, "NOTES & DISCLAIMERS", "=" * 42) + PRINT("Park Lab is a research laboratory in the Department of Biomedical Informatics at Harvard Medical School.") + PRINT("Park Lab checkers are intended for internal use and may not be suitable for other purposes.") + PRINT("External organizations must make their own independent choices about license acceptability.") + PRINT("Such choices can be integrated with this tool as follows:") + PRINT(" * Import LicenseChecker and LicenseCheckerRegistry from dcicutils.license_utils.") + PRINT(" * Make your own subclass of LicenseChecker, 
specifying a doc string and appropriate constraints.") + PRINT(" * Decorate your subclass with an appropriate call to LicenseCheckerRegistry.register_checker.") + PRINT("") + + +def run_license_checker(name: Optional[str], + verbose=LicenseOptions.VERBOSE, + debug=LicenseOptions.DEBUG, + conda_prefix=LicenseOptions.CONDA_PREFIX, + policy_dir=LicenseOptions.POLICY_DIR): + if name is None: + show_help_for_choosing_license_checker() + else: + with LicenseOptions.selected_options(verbose=verbose, debug=debug, conda_prefix=conda_prefix, + policy_dir=policy_dir): + try: + checker_class: Type[LicenseChecker] = LicenseCheckerRegistry.lookup_checker(name, autoload=True) + except Exception as e: + raise ScriptFailure(str(e)) + try: + checker_class.validate() + except LicenseCheckFailure as e: + raise ScriptFailure(get_error_message(e)) diff --git a/poetry.lock b/poetry.lock index d7e77523c..c59c8f953 100644 --- a/poetry.lock +++ b/poetry.lock @@ -884,6 +884,18 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "jsonc-parser" +version = "1.1.5" +description = "A lightweight, native tool for parsing .jsonc files" +category = "main" +optional = false +python-versions = ">=3.5" +files = [ + {file = "jsonc-parser-1.1.5.tar.gz", hash = "sha256:7126d17725b0413cd40af4297d9f6412c4181a62135e4c41cdf8f6a82c5936e6"}, + {file = "jsonc_parser-1.1.5-py3-none-any.whl", hash = "sha256:abd1db76a4c6d1733ec7bb5340a89c49cbc878a181a1e7947ee6719eedf2c6cc"}, +] + [[package]] name = "mccabe" version = "0.7.0" @@ -1594,4 +1606,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = ">=3.7,<3.10" -content-hash = "b8d6612bb28cfb9da79306a82b2ac35a20678e1f62ef86c93b8af3c3d1ed798e" +content-hash = "ca11caee3bf14b381e0aaec68ca6bca23f89064db9d90a61e9500e23eab8106f" diff --git a/pyproject.toml b/pyproject.toml index 
65dba0353..0ef6b8f04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "7.11.0" +version = "7.14.0" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT" @@ -45,6 +45,7 @@ elasticsearch = "7.13.4" aws-requests-auth = ">=0.4.2,<1" docker = "^4.4.4" gitpython = "^3.1.2" +jsonc-parser = "^1.1.5" pytz = ">=2020.4" PyYAML = ">=5.1,<5.5" requests = "^2.21.0" @@ -81,6 +82,8 @@ pytest-runner = ">=5.1" [tool.poetry.scripts] publish-to-pypi = "dcicutils.scripts.publish_to_pypi:main" show-contributors = "dcicutils.contribution_scripts:show_contributors_main" +run-license-checker = "dcicutils.scripts.run_license_checker:main" + [tool.pytest.ini_options] addopts = "--basetemp=/tmp/pytest" diff --git a/test/test_ff_utils.py b/test/test_ff_utils.py index 92dc077be..742a4283e 100644 --- a/test/test_ff_utils.py +++ b/test/test_ff_utils.py @@ -1397,7 +1397,7 @@ def test_get_schema_with_vapp(): mock_get_authentication_with_server.assert_not_called() mock_get_metadata.assert_not_called() - sample_vapp.get.assert_called_once_with('profiles/User.json?frame=raw') + sample_vapp.get.assert_called_once_with('/profiles/User.json?frame=raw') @pytest.mark.unit @@ -1449,7 +1449,7 @@ def test_get_schemas_with_vapp(): mock_get_authentication_with_server.assert_not_called() mock_get_metadata.assert_not_called() - sample_vapp.get.assert_called_once_with('profiles/?frame=raw') + sample_vapp.get.assert_called_once_with('/profiles/?frame=raw') def test_get_schemas_options(): diff --git a/test/test_glacier_utils.py b/test/test_glacier_utils.py index e69bbc3cb..b44c50cd6 100644 --- a/test/test_glacier_utils.py +++ b/test/test_glacier_utils.py @@ -27,6 +27,7 @@ def mock_health_page() -> dict: 'file_upload_bucket': 'cgap-dummy-main-application-cgap-dummy-files', 'namespace': 'cgap-dummy', 'processed_file_bucket': 
'cgap-dummy-main-application-cgap-dummy-wfoutput', + 's3_encrypt_key_id': 'dummy_kms_key', } diff --git a/test/test_license_utils.py b/test/test_license_utils.py index 78eba6905..e0d0fdc25 100644 --- a/test/test_license_utils.py +++ b/test/test_license_utils.py @@ -1,19 +1,28 @@ import copy import datetime +import glob import io import json import os import pytest +import re import subprocess as subprocess_module from collections import defaultdict +from dcicutils import license_utils as license_utils_module +from dcicutils.common import Regexp from dcicutils.license_utils import ( - LicenseFrameworkRegistry, LicenseFramework, PythonLicenseFramework, JavascriptLicenseFramework, + POLICY_DIR, + LicenseOptions, LicenseFrameworkRegistry, LicenseFramework, LicenseCheckerRegistry, + PythonLicenseFramework, JavascriptLicenseFramework, CondaLicenseFramework, RLicenseFramework, LicenseAnalysis, LicenseChecker, LicenseStatus, LicenseFileParser, LicenseCheckFailure, LicenseOwnershipCheckFailure, LicenseAcceptabilityCheckFailure, warnings as license_utils_warnings_module, + extract_boolean_terms, simplify_license_versions, load_license_policies, literal_string_or_regexp_from_dict, + default_policy_data, str_or_regexp_sort_key, get_attrs_for_classes, find_or_create_license_class, + use_policy_literal, merge_policy_lists, merge_policy_strings_or_regexps, merge_policy_dicts, built_in_policy_names, ) -from dcicutils.misc_utils import ignored, file_contents +from dcicutils.misc_utils import ignored, file_contents, local_attrs from dcicutils.qa_utils import printed_output, MockFileSystem from unittest import mock @@ -134,12 +143,12 @@ def test_license_framework_registry_register(): # decorator with LicenseFrameworkRegistry.temporary_registration_for_testing(): with pytest.raises(ValueError): - @LicenseFrameworkRegistry.register(name='bogus_dummy') + @LicenseFrameworkRegistry.register_framework(name='bogus_dummy') class BogusDummyLicenseFramework: pass 
ignored(BogusDummyLicenseFramework) - @LicenseFrameworkRegistry.register(name='dummy') + @LicenseFrameworkRegistry.register_framework(name='dummy') class DummyLicenseFramework(LicenseFramework): pass @@ -157,8 +166,10 @@ def test_license_framework_registry_all_frameworks(): assert all(isinstance(framework, type) and issubclass(framework, LicenseFramework) for framework in frameworks) assert sorted(frameworks, key=lambda x: x.NAME) == [ + CondaLicenseFramework, JavascriptLicenseFramework, PythonLicenseFramework, + RLicenseFramework ] @@ -166,7 +177,7 @@ def test_license_framework_registry_find_framework(): with LicenseFrameworkRegistry.temporary_registration_for_testing(): - @LicenseFrameworkRegistry.register(name='dummy1') + @LicenseFrameworkRegistry.register_framework(name='dummy1') class DummyLicenseFramework1(LicenseFramework): pass @@ -184,10 +195,58 @@ class DummyLicenseFramework1(LicenseFramework): LicenseFrameworkRegistry.find_framework(1) # noQA - arg is intentionally of wrong type for testing -def test_javascript_license_framework_implicated_licenses(): +def test_javascript_license_framework_strip_version(): + + print() # start on fresh line + + strip_version = JavascriptLicenseFramework.strip_version + + assert strip_version('') == '' + + assert strip_version('foo') == 'foo' + assert strip_version('foo@bar') == 'foo@bar' + + assert strip_version('foo@3') == 'foo' + assert strip_version('foo@3.1') == 'foo' + assert strip_version('foo@3.1.0') == 'foo' + assert strip_version('foo@3.1.0b3') == 'foo' + assert strip_version('foo@3.1-beta') == 'foo' + + assert strip_version("@foo-3.1-beta") == '@foo-3.1-beta' # we don't treat leading '@' as a version marker + assert strip_version('foo@.9') == 'foo' # we tolerate a leading dot even though it's probably bad form + assert strip_version('foo@beta-3.9') == 'foo@beta-3.9' # treating suffix as version here is farther than we'll go + + +@pytest.mark.parametrize('debug', [False, True]) +def 
test_simplify_license_versions(debug): + + def test_it(spec, expected): + with local_attrs(LicenseOptions, DEBUG=True): + with printed_output() as printed: + assert simplify_license_versions(spec, for_package_name='ignored') == expected + assert printed.last == f"Rewriting {spec!r} as {expected!r}." + + test_it('GPL (version 2)', 'GPL-2') + test_it('GPL (version 2.0)', 'GPL-2.0') + test_it('GPL (= 2.0)', 'GPL-2.0') + test_it('GPL (= 2.1)', 'GPL-2.1') + + test_it('GPL (>= 2)', 'GPL-2+') + test_it('GPL (>= 2.0)', 'GPL-2.0+') + test_it('GPL (version 2 or greater)', 'GPL-2+') + test_it('GPL (version 2 or later)', 'GPL-2+') + + +@pytest.mark.parametrize('debug', [False, True]) +def test_extract_boolean_terms(debug): + + print() # start on a blank line def check_implications(spec, implications): - assert JavascriptLicenseFramework.implicated_licenses(licenses_spec=spec) == implications + with local_attrs(LicenseOptions, DEBUG=debug): + with printed_output() as printed: + assert extract_boolean_terms(spec, for_package_name='ignored') == implications + assert printed.lines == ([f"Rewriting {spec!r} as {implications!r}."] if debug else []) check_implications(spec='(MIT AND BSD-3-Clause)', implications=['BSD-3-Clause', 'MIT']) check_implications(spec='(CC-BY-4.0 AND OFL-1.1 AND MIT)', implications=['CC-BY-4.0', 'MIT', 'OFL-1.1']) @@ -196,41 +255,66 @@ def check_implications(spec, implications): check_implications(spec='(FOO OR (BAR AND BAZ))', implications=['BAR', 'BAZ', 'FOO']) + sample_package = 'some-package' + assert extract_boolean_terms('MIT or file FOO', for_package_name=sample_package) == [ + f'Custom: {sample_package} file FOO', + 'MIT', + ] -def test_javascript_license_framework_get_licenses(): - print() # start on a fresh line - packages = {} - for i, license in enumerate(['Apache-2.0', 'MIT', '(MIT OR Apache-2.0)', ''], start=1): - package = f'package{i}' - packages[f"package{i}"] = { - "licenses": license, - "repository": f"https://github.com/dummy/{package}", - 
"publisher": f"J Dummy{i}", - "email": f"jdummy{i}@dummyhost.example.com", - "path": f"/some/path/to/package{i}", - "licenseFile": f"/some/path/to/package{i}/license" - } - subprocess_output = json.dumps(packages) - with mock.patch.object(subprocess_module, "check_output") as mock_check_output: - mock_check_output.return_value = subprocess_output - with printed_output() as printed: - assert JavascriptLicenseFramework.get_dependencies() == [ - {'licenses': ['Apache-2.0'], 'name': 'package1'}, - {'licenses': ['MIT'], 'name': 'package2'}, - {'licenses': ['Apache-2.0', 'MIT'], 'name': 'package3'}, - {'licenses': [], 'name': 'package4'}, - ] - assert printed.lines == [ - "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 'MIT']" - ] +@pytest.mark.parametrize('debug', [False, True]) +def test_javascript_license_framework_implicated_licenses(debug): - # A special case for missing data... - mock_check_output.return_value = "{}\n\n" - with pytest.raises(Exception) as esc: - # When no package data is available, {} gets returned, and we need to complain this is odd. - JavascriptLicenseFramework.get_dependencies() - assert str(esc.value) == "No javascript license data was found." 
+ def check_implications(spec, implications): + with local_attrs(LicenseOptions, DEBUG=debug): + with printed_output() as printed: + assert JavascriptLicenseFramework.implicated_licenses(package_name='ignored', + licenses_spec=spec) == implications + assert printed.lines == ([f"Rewriting {spec!r} as {implications!r}."] if debug else []) + + check_implications(spec='(MIT AND BSD-3-Clause)', implications=['BSD-3-Clause', 'MIT']) + check_implications(spec='(CC-BY-4.0 AND OFL-1.1 AND MIT)', implications=['CC-BY-4.0', 'MIT', 'OFL-1.1']) + + check_implications(spec='(MIT OR Apache-2.0)', implications=['Apache-2.0', 'MIT']) + + check_implications(spec='(FOO OR (BAR AND BAZ))', implications=['BAR', 'BAZ', 'FOO']) + + +@pytest.mark.parametrize('debug', [False, True]) +def test_javascript_license_framework_get_licenses(debug): + + with local_attrs(LicenseOptions, DEBUG=debug): + print() # start on a fresh line + packages = {} + for i, license in enumerate(['Apache-2.0', 'MIT', '(MIT OR Apache-2.0)', ''], start=1): + package = f'package{i}' + packages[f"package{i}"] = { + "licenses": license, + "repository": f"https://github.com/dummy/{package}", + "publisher": f"J Dummy{i}", + "email": f"jdummy{i}@dummyhost.example.com", + "path": f"/some/path/to/package{i}", + "licenseFile": f"/some/path/to/package{i}/license" + } + subprocess_output = json.dumps(packages) + with mock.patch.object(subprocess_module, "check_output") as mock_check_output: + mock_check_output.return_value = subprocess_output + with printed_output() as printed: + assert JavascriptLicenseFramework.get_dependencies() == [ + {'framework': 'javascript', 'licenses': ['Apache-2.0'], 'name': 'package1'}, + {'framework': 'javascript', 'licenses': ['MIT'], 'name': 'package2'}, + {'framework': 'javascript', 'licenses': ['Apache-2.0', 'MIT'], 'name': 'package3'}, + {'framework': 'javascript', 'licenses': [], 'name': 'package4'}, + ] + expected_rewrite_description = "Rewriting '(MIT OR Apache-2.0)' as ['Apache-2.0', 
'MIT']." + assert printed.lines == ([expected_rewrite_description] if debug else []) + + # A special case for missing data... + mock_check_output.return_value = "{}\n\n" + with pytest.raises(Exception) as esc: + # When no package data is available, {} gets returned, and we need to complain this is odd. + JavascriptLicenseFramework.get_dependencies() + assert str(esc.value) == "No javascript license data was found." def test_python_license_framework_piplicenses_args(): @@ -623,8 +707,10 @@ def test_license_checker_analyze_license_dependencies_by_framework(): analysis = LicenseAnalysis() LicenseChecker.analyze_license_dependencies_by_framework(analysis=analysis, frameworks=None) assert mock_analyze.mock_calls == [ + mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=CondaLicenseFramework), mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=JavascriptLicenseFramework), mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=PythonLicenseFramework), + mock.call(analysis=analysis, acceptable=None, exceptions=None, framework=RLicenseFramework), ] @@ -681,3 +767,221 @@ def mocked_license_logger(message): LicenseFileParser.validate_simple_license_file(filename='LICENSE.txt', analysis=analysis) assert analysis.miscellaneous == ["The copyright year, '2020', should have '2023' at the end."] assert license_warnings == [] + + +def test_default_policy_data(): + + class MyCondaClass(LicenseChecker): + LICENSE_FRAMEWORKS = ['conda'] + EXCEPTIONS = { + 'something': ['some-lib'] + } + + def check_it(input, expected, *, parents=None): + parents = parents or [] + assert default_policy_data(policy_name='some-policy', policy_data=input, parent_classes=parents) == expected + + check_it({'LICENSE_FRAMEWORKS': ['a', 'b']}, {'LICENSE_FRAMEWORKS': ['a', 'b']}) + check_it({'LICENSE_FRAMEWORKS': ['a', 'b']}, {'LICENSE_FRAMEWORKS': ['a', 'b']}, parents=[MyCondaClass]) + check_it({}, {}, parents=[MyCondaClass]) + + check_it( + { + 
'EXCEPTIONS': { + 'something': ['some-random-lib'], + 'something-else': ['some-other-lib'] + } + }, + { + 'EXCEPTIONS': { + 'something': ['some-lib', 'some-random-lib'], + 'something-else': ['some-other-lib'] + } + }, + parents=[MyCondaClass]) + + +def test_use_policy_literal(): + + class MyIgnoredLicenseChecker(LicenseChecker): + pass + + assert use_policy_literal(policy_name='ignored', policy_datum='anything', + other_policy_data=[MyIgnoredLicenseChecker]) == 'anything' + + +def test_str_or_regexp_sort_key(): + + assert str_or_regexp_sort_key('foo') == 'foo' + assert str_or_regexp_sort_key(re.compile('foo')) == 'foo' + + +def test_merge_policy_lists(): + + list1 = ['a', 'c', 'b'] + list2 = ['f', 'a'] + list3 = ['g', 'a'] + + actual = merge_policy_lists(policy_name='ignored', policy_datum=list1, other_policy_data=[]) + expected = ['a', 'b', 'c'] + assert actual == expected + + actual = merge_policy_lists(policy_name='ignored', policy_datum=list1, other_policy_data=[list2]) + expected = ['a', 'b', 'c', 'f'] + assert actual == expected + + actual = merge_policy_lists(policy_name='ignored', policy_datum=list1, other_policy_data=[list2, list3]) + expected = ['a', 'b', 'c', 'f', 'g'] + assert actual == expected + + with pytest.raises(Exception): + merge_policy_lists(policy_name='ignored', policy_datum=['a', re.compile('foo')], other_policy_data=[]) + + +def test_merge_policy_strings_or_regexps(): + + regexp_foo = re.compile('foo') + regexp_bar = re.compile('bar') + + list1 = ['a', regexp_foo, 'c', 'b'] + list2 = ['f', regexp_bar, 'a'] + list3 = [regexp_foo, 'g', 'a'] + + actual = merge_policy_strings_or_regexps(policy_name='ignored', policy_datum=list1, other_policy_data=[]) + expected = ['a', 'b', 'c', regexp_foo] + assert actual == expected + + actual = merge_policy_strings_or_regexps(policy_name='ignored', policy_datum=list1, other_policy_data=[list2]) + expected = ['a', 'b', regexp_bar, 'c', 'f', regexp_foo] + assert actual == expected + + actual = 
merge_policy_strings_or_regexps(policy_name='ignored', policy_datum=list1, + other_policy_data=[list2, list3]) + expected = ['a', 'b', regexp_bar, 'c', 'f', regexp_foo, 'g'] + assert actual == expected + + +def test_merge_policy_dicts(): + + dict1 = {'foo': ['a', 'b'], 'bar': ['x', 'z']} + dict2 = {'alpha': ['p', 'q']} + dict3 = {'foo': ['a', 'c'], 'baz': ['z', 'w']} + + actual = merge_policy_dicts(policy_name='ignored', policy_datum=dict1, other_policy_data=[]) + expected = {'bar': ['x', 'z'], 'foo': ['a', 'b']} + assert actual == expected + + actual = merge_policy_dicts(policy_name='ignored', policy_datum=dict1, other_policy_data=[dict2]) + expected = {'alpha': ['p', 'q'], 'bar': ['x', 'z'], 'foo': ['a', 'b']} + assert actual == expected + + actual = merge_policy_dicts(policy_name='ignored', policy_datum=dict1, other_policy_data=[dict2, dict3]) + expected = {'alpha': ['p', 'q'], 'bar': ['x', 'z'], 'baz': ['w', 'z'], 'foo': ['a', 'b', 'c']} + assert actual == expected + + +def test_get_attrs_for_classes(): + + class ClassA: + PROP1 = 'val1A' + PROP2 = 'val2A' + + class ClassB: + PROP2 = 'val2B' + PROP3 = 'val3B' + + class ClassC: + PROP1 = 'val1C' + + class ClassAB(ClassA): + PROP1 = 'val1AB' + PROP2 = None + + # Note that the order of the results is the order of the classes in which the value occurs, NOT alphabetical. 
+ + assert get_attrs_for_classes('PROP1', [ClassA]) == ['val1A'] + assert get_attrs_for_classes('PROP1', [ClassA, ClassB]) == ['val1A'] + assert get_attrs_for_classes('PROP1', [ClassA, ClassB, ClassC]) == ['val1A', 'val1C'] + assert get_attrs_for_classes('PROP1', [ClassA, ClassB, ClassC, ClassAB]) == ['val1A', 'val1C', 'val1AB'] + assert get_attrs_for_classes('PROP1', [ClassAB, ClassA, ClassB, ClassC]) == ['val1AB', 'val1A', 'val1C'] + + assert get_attrs_for_classes('PROP2', [ClassA]) == ['val2A'] + assert get_attrs_for_classes('PROP2', [ClassA, ClassB]) == ['val2A', 'val2B'] + assert get_attrs_for_classes('PROP2', [ClassA, ClassB, ClassC]) == ['val2A', 'val2B'] + assert get_attrs_for_classes('PROP2', [ClassA, ClassB, ClassC, ClassAB]) == ['val2A', 'val2B'] # None is ignored + assert get_attrs_for_classes('PROP2', [ClassAB, ClassA, ClassB, ClassC]) == ['val2A', 'val2B'] # ditto + + assert get_attrs_for_classes('PROP3', [ClassA]) == [] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB]) == ['val3B'] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB, ClassC]) == ['val3B'] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB, ClassC, ClassAB]) == ['val3B'] + assert get_attrs_for_classes('PROP3', [ClassA, ClassB, ClassC, ClassAB]) == ['val3B'] + + +def test_literal_string_or_regexp_from_dict(): + + print() # start on a fresh line + + sample_string = "foo" + assert literal_string_or_regexp_from_dict(sample_string) == sample_string + + sample_regexp_pattern_1 = "foo.*" + sample_regexp_pattern_2 = "(bar)" + sample_regexp_pattern_3 = sample_regexp_pattern_1 + sample_regexp_pattern_2 + + default_flags = re.UNICODE | re.IGNORECASE + + result = literal_string_or_regexp_from_dict({"pattern": sample_regexp_pattern_1}) + assert isinstance(result, Regexp) + assert result.pattern == sample_regexp_pattern_1 + assert result.flags == default_flags + + result = literal_string_or_regexp_from_dict({"pattern": [sample_regexp_pattern_1, sample_regexp_pattern_2]}) + assert 
isinstance(result, Regexp) + assert result.pattern == sample_regexp_pattern_3 + + result = literal_string_or_regexp_from_dict({"pattern": sample_regexp_pattern_1, "flags": ["VERBOSE"]}) + assert isinstance(result, Regexp) + assert result.pattern == sample_regexp_pattern_1 + assert result.flags == default_flags | re.VERBOSE + + +def test_find_or_create_license_class(): + test_registry = {} + policy_data_cache = {} + + class TestChecker(LicenseChecker): + pass + + with mock.patch.object(license_utils_module, "find_policy_data") as mock_find_policy_data: + with mock.patch.object(LicenseCheckerRegistry, "REGISTRY", test_registry): + with mock.patch.object(license_utils_module, "POLICY_DATA_CACHE", policy_data_cache): + + # This tests the find part + test_registry['test'] = TestChecker + assert find_or_create_license_class(policy_name='test', policy_dir='ignored') == TestChecker + mock_find_policy_data.assert_not_called() + + mock_find_policy_data.return_value = {"inherits_from": []} + policy_class = find_or_create_license_class(policy_name='something', policy_dir='/my/policy/dir') + assert issubclass(policy_class, LicenseChecker) + + +def test_load_license_policies(): + test_policy_names = ['my_project', 'your_project'] + policy_dir_for_testing = 'some/dir/' + with mock.patch.object(license_utils_module, "find_or_create_license_class") as mock_find_or_create_license_class: + with mock.patch.object(license_utils_module, "built_in_policy_names") as mock_built_in_policy_names: + mock_built_in_policy_names.return_value = test_policy_names + load_license_policies(policy_dir=policy_dir_for_testing) + mock_find_or_create_license_class.assert_has_calls([ + mock.call(policy_name=policy_name, policy_dir=policy_dir_for_testing) + for policy_name in test_policy_names + ]) + + +def test_built_in_policy_names(): + test_project_names = ['my_project', 'your_project'] + with mock.patch.object(glob, "glob") as mock_glob_glob: + mock_glob_glob.return_value = [os.path.join(POLICY_DIR, 
f"{name}.jsonc") for name in test_project_names] + assert built_in_policy_names() == test_project_names diff --git a/test/test_misc_utils.py b/test/test_misc_utils.py index a07c6d234..5b80a8ae7 100644 --- a/test/test_misc_utils.py +++ b/test/test_misc_utils.py @@ -30,7 +30,7 @@ classproperty, classproperty_cached, classproperty_cached_each_subclass, Singleton, NamedObject, obsolete, ObsoleteError, CycleError, TopologicalSorter, keys_and_values_to_dict, dict_to_keys_and_values, is_c4_arn, deduplicate_list, chunked, parse_in_radix, format_in_radix, managed_property, future_datetime, - MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, + MIN_DATETIME, MIN_DATETIME_UTC, INPUT, builtin_print, map_chunked, to_camel_case, json_file_contents, ) from dcicutils.qa_utils import ( Occasionally, ControlledTime, override_environ as qa_override_environ, MockFileSystem, printed_output, @@ -1788,6 +1788,16 @@ def test_file_contents(): assert file_contents("foo.bin", binary=False) == 'Hello!\n' +def test_json_file_contents(): + + mfs = MockFileSystem() + sample_data = {"foo": 1, "bar": [2, True]} + with mock.patch("io.open", mfs.open): + with io.open("foo.txt", 'w') as fp: + json.dump(sample_data, fp) + assert json_file_contents("foo.txt") == sample_data + + def test_make_counter(): counter = make_counter() @@ -1990,8 +2000,9 @@ def test_snake_case_to_camel_case_hyphenated(token, expected): ('x_m_l_container', 'XMLContainer'), ('X_M_L_Container', 'XMLContainer'), ]) -def test_to_camel_case_hyphenated(token, expected): +def test_to_camel_case(token, expected): assert to_camel_case(token) == expected + assert to_camel_case(token.replace('_', '-')) == expected assert to_camel_case(expected) == expected # make sure it's stable