From f5b76416daed8e24fbedc1bcf2d445f58e096818 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 8 Jul 2024 15:35:04 +0200 Subject: [PATCH] Ensure we can reindex metadata #496 #494 * Optionally override package data with new, non-empty manifest data * Expose new experimental API endpoint extra to recollect manifest data with a rescan at collect/reindex_metadata with this override capability * Enable this for Maven only for now Reference: https://github.com/nexB/purldb/issues/496 Reference: https://github.com/nexB/purldb/issues/494 Signed-off-by: Philippe Ombredanne --- minecode/model_utils.py | 20 ++++++++---- minecode/visitors/maven.py | 24 +++++++++----- packagedb/api.py | 67 ++++++++++++++++++++++++++++++++++++++ packagedb/models.py | 8 +++-- 4 files changed, 103 insertions(+), 16 deletions(-) diff --git a/minecode/model_utils.py b/minecode/model_utils.py index 6bb6f998..97e875a8 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -219,19 +219,27 @@ def merge_packages(existing_package, new_package_data, replace=False): return updated_fields -def merge_or_create_package(scanned_package, visit_level): +def merge_or_create_package(scanned_package, visit_level, override=False): """ - Update Package from `scanned_package` instance if `visit_level` is greater + Update Package from ``scanned_package`` instance if `visit_level` is greater than the mining level of the existing package. - If `scanned_package` does not exist in the PackageDB, create a new entry in - the PackageDB for `scanned_package`. + If ``scanned_package`` does not exist in the PackageDB, create a new entry in + the PackageDB for ``scanned_package``. + + If ``override`` is True, then all existing empty values of the PackageDB package are replaced by + a non-empty value of the provided override. 
""" created = False merged = False package = None map_error = '' + mining_level = visit_level + if override: + # raise the visit level by one: this will force the data override + visit_level += 1 + if not isinstance(scanned_package, PackageData): msg = 'Not a ScanCode PackageData type:' + repr(scanned_package) map_error += msg + '\n' @@ -297,7 +305,7 @@ def merge_or_create_package(scanned_package, visit_level): # package and the existing package, the existing package parties should be # deleted first and then the new package's parties added. - stored_package.mining_level = visit_level + stored_package.mining_level = mining_level if updated_fields: data = { @@ -335,7 +343,7 @@ def merge_or_create_package(scanned_package, visit_level): filename=fileutils.file_name(package_uri), # TODO: update the PackageDB model release_date=scanned_package.release_date, - mining_level=visit_level, + mining_level=mining_level, type=scanned_package.type, namespace=scanned_package.namespace, name=scanned_package.name, diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index 95d9be35..c1842580 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -242,11 +242,13 @@ def merge_ancestors(ancestor_pom_texts, package): return package -def map_maven_package(package_url, package_content, pipelines): +def map_maven_package(package_url, package_content, pipelines, reindex_metadata=False): """ Add a maven `package_url` to the PackageDB. Return an error string if errors have occured in the process. + + if ``reindex_metadata`` is True, only reindex metadata and DO NOT rescan the full package.
""" from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package @@ -307,20 +309,22 @@ def map_maven_package(package_url, package_content, pipelines): sha1 = get_package_sha1(package) if sha1: package.sha1 = sha1 - db_package, _, _, _ = merge_or_create_package(package, visit_level=50) + override = reindex_metadata + db_package, _, _, _ = merge_or_create_package(package, visit_level=50, override=override) else: msg = f'Failed to retrieve JAR: {package_url}' error += msg + '\n' logger.error(msg) - - # Submit package for scanning - if db_package: - add_package_to_scan_queue(db_package, pipelines) + + if not reindex_metadata: + # Submit package for scanning + if db_package: + add_package_to_scan_queue(package=db_package, pipelines=pipelines) return db_package, error -def map_maven_binary_and_source(package_url, pipelines): +def map_maven_binary_and_source(package_url, pipelines, reindex_metadata=False): """ Get metadata for the binary and source release of the Maven package `package_url` and save it to the PackageDB. 
@@ -332,6 +336,7 @@ def map_maven_binary_and_source(package_url, pipelines): package_url=package_url, package_content=PackageContentType.BINARY, pipelines=pipelines, + reindex_metadata=reindex_metadata, ) if emsg: error += emsg @@ -342,6 +347,7 @@ def map_maven_binary_and_source(package_url, pipelines): package_url=source_package_url, package_content=PackageContentType.SOURCE_ARCHIVE, pipelines=pipelines, + reindex_metadata=reindex_metadata, ) if emsg: error += emsg @@ -428,6 +434,7 @@ def process_request(purl_str, **kwargs): addon_pipelines = kwargs.get('addon_pipelines', []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) + try: package_url = PackageURL.from_string(purl_str) except ValueError as e: @@ -436,7 +443,8 @@ def process_request(purl_str, **kwargs): has_version = bool(package_url.version) if has_version: - error = map_maven_binary_and_source(package_url, pipelines) + reindex_metadata=kwargs.get("reindex_metadata", False) + error = map_maven_binary_and_source(package_url, pipelines, reindex_metadata=reindex_metadata) else: error = map_maven_packages(package_url, pipelines) diff --git a/packagedb/api.py b/packagedb/api.py index e7560c8f..6fa9b6e3 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -1030,6 +1030,73 @@ def _reindex_package(package, reindexed_packages, **kwargs): serializer = IndexPackagesResponseSerializer(response_data, context={'request': request}) return Response(serializer.data) + @extend_schema( + parameters=[ + OpenApiParameter('purl', str, 'query', description='PackageURL', required=True), + ], + responses={200:PackageAPISerializer()}, + ) + @action(detail=False, methods=['get'], serializer_class=CollectPackageSerializer) + def reindex_metadata(self, request, *args, **kwargs): + """ + Collect or recollect the package metadata of a ``PURL`` string. + Also recollects all packages in the set of the PURL. 
+ + If the PURL does exist, calling this endpoint will re-collect, re-store and return the + Package metadata immediately. + + If the package does not exist in the database this call does nothing. + NOTE: this WILL NOT re-run scan and indexing in the background in contrast with the /collect + and collect/index_packages endpoints. + + **Request example**:: + + /api/collect/reindex_metadata/?purl=pkg:npm/foo@0.0.7 + + """ + serializer = self.serializer_class(data=request.query_params) + if not serializer.is_valid(): + return Response( + {'errors': serializer.errors}, + status=status.HTTP_400_BAD_REQUEST, + ) + + validated_data = serializer.validated_data + purl = validated_data.get('purl') + + lookups = purl_to_lookups(purl) + packages = Package.objects.filter(**lookups) + if packages.count() == 0: + return Response( + {'status': f'Not recollecting: Package does not exist for {purl}'}, + status=status.HTTP_400_BAD_REQUEST, + ) + + # Pass to only reindex_metadata downstream + kwargs["reindex_metadata"] = True + # here we have a package(s) matching our purl and we want to recollect metadata live + try: + errors = priority_router.process(purl, **kwargs) + except NoRouteAvailable: + message = { + 'status': f'cannot fetch Package data for {purl}: no available handler' + } + return Response(message, status=status.HTTP_400_BAD_REQUEST) + + lookups = purl_to_lookups(purl) + packages = Package.objects.filter(**lookups) + if packages.count() == 0: + message = {} + if errors: + message = { + 'status': f'error(s) occurred when fetching metadata for {purl}: {errors}' + } + return Response(message) + + serializer = PackageAPISerializer(packages, many=True, context={'request': request}) + return Response(serializer.data) + + class PurlValidateViewSet(viewsets.ViewSet): """ diff --git a/packagedb/models.py b/packagedb/models.py index f1069e45..9e2ade24 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -1211,13 +1211,17 @@ def __str__(self): def make_relationship( -
from_package, to_package, relationship + from_package, to_package, relationship, ): - return PackageRelation.objects.create( + """ + Return the from/to package relationship, creating it if it does not exist. + """ + pkg, _created = PackageRelation.objects.get_or_create( from_package=from_package, to_package=to_package, relationship=relationship, ) + return pkg class PackageWatch(models.Model):