Skip to content

Commit

Permalink
Ensure we can reindex metadata #496 #494
Browse files Browse the repository at this point in the history
* Optionally override package data with new, non-empty manifest data
* Expose new experimental API endpoint extra to recollect manifest data
  with a rescan at collect/reindex_metadata with this override
  capability
* Enable this for Maven only for now

Reference: #496
Reference: #494
Signed-off-by: Philippe Ombredanne <[email protected]>
  • Loading branch information
pombredanne committed Jul 8, 2024
1 parent 397e3b3 commit f5b7641
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 16 deletions.
20 changes: 14 additions & 6 deletions minecode/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,19 +219,27 @@ def merge_packages(existing_package, new_package_data, replace=False):
return updated_fields


def merge_or_create_package(scanned_package, visit_level):
def merge_or_create_package(scanned_package, visit_level, override=False):
"""
Update Package from `scanned_package` instance if `visit_level` is greater
Update Package from ``scanned_package`` instance if `visit_level` is greater
than the mining level of the existing package.
If `scanned_package` does not exist in the PackageDB, create a new entry in
the PackageDB for `scanned_package`.
If ``scanned_package`` does not exist in the PackageDB, create a new entry in
the PackageDB for ``scanned_package``.
If ``override`` is True, then all existing empty values of the PackageDB package are replaced by
a non-empty value of the provided override.
"""
created = False
merged = False
package = None
map_error = ''

mining_level = visit_level
if override:
# this will force the data override
visit_level =+1

if not isinstance(scanned_package, PackageData):
msg = 'Not a ScanCode PackageData type:' + repr(scanned_package)
map_error += msg + '\n'
Expand Down Expand Up @@ -297,7 +305,7 @@ def merge_or_create_package(scanned_package, visit_level):
# package and the existing package, the existing package parties should be
# deleted first and then the new package's parties added.

stored_package.mining_level = visit_level
stored_package.mining_level = mining_level

if updated_fields:
data = {
Expand Down Expand Up @@ -335,7 +343,7 @@ def merge_or_create_package(scanned_package, visit_level):
filename=fileutils.file_name(package_uri),
# TODO: update the PackageDB model
release_date=scanned_package.release_date,
mining_level=visit_level,
mining_level=mining_level,
type=scanned_package.type,
namespace=scanned_package.namespace,
name=scanned_package.name,
Expand Down
24 changes: 16 additions & 8 deletions minecode/visitors/maven.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,11 +242,13 @@ def merge_ancestors(ancestor_pom_texts, package):
return package


def map_maven_package(package_url, package_content, pipelines):
def map_maven_package(package_url, package_content, pipelines, reindex_metadata=False):
"""
Add a maven `package_url` to the PackageDB.
Return an error string if errors have occurred in the process.
If ``reindex_metadata`` is True, only reindex metadata and DO NOT rescan the full package.
"""
from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package

Expand Down Expand Up @@ -307,20 +309,22 @@ def map_maven_package(package_url, package_content, pipelines):
sha1 = get_package_sha1(package)
if sha1:
package.sha1 = sha1
db_package, _, _, _ = merge_or_create_package(package, visit_level=50)
override = reindex_metadata
db_package, _, _, _ = merge_or_create_package(package, visit_level=50, override=override)
else:
msg = f'Failed to retrieve JAR: {package_url}'
error += msg + '\n'
logger.error(msg)

# Submit package for scanning
if db_package:
add_package_to_scan_queue(db_package, pipelines)

if not reindex_metadata:
# Submit package for scanning
if db_package:
add_package_to_scan_queue(package=db_package, pipelines=pipelines)

return db_package, error


def map_maven_binary_and_source(package_url, pipelines):
def map_maven_binary_and_source(package_url, pipelines, reindex_metadata=False):
"""
Get metadata for the binary and source release of the Maven package
`package_url` and save it to the PackageDB.
Expand All @@ -332,6 +336,7 @@ def map_maven_binary_and_source(package_url, pipelines):
package_url=package_url,
package_content=PackageContentType.BINARY,
pipelines=pipelines,
reindex_metadata=reindex_metadata,
)
if emsg:
error += emsg
Expand All @@ -342,6 +347,7 @@ def map_maven_binary_and_source(package_url, pipelines):
package_url=source_package_url,
package_content=PackageContentType.SOURCE_ARCHIVE,
pipelines=pipelines,
reindex_metadata=reindex_metadata,
)
if emsg:
error += emsg
Expand Down Expand Up @@ -428,6 +434,7 @@ def process_request(purl_str, **kwargs):
addon_pipelines = kwargs.get('addon_pipelines', [])
pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)


try:
package_url = PackageURL.from_string(purl_str)
except ValueError as e:
Expand All @@ -436,7 +443,8 @@ def process_request(purl_str, **kwargs):

has_version = bool(package_url.version)
if has_version:
error = map_maven_binary_and_source(package_url, pipelines)
reindex_metadata=kwargs.get("reindex_metadata", False)
error = map_maven_binary_and_source(package_url, pipelines, reindex_metadata=reindex_metadata)
else:
error = map_maven_packages(package_url, pipelines)

Expand Down
67 changes: 67 additions & 0 deletions packagedb/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,6 +1030,73 @@ def _reindex_package(package, reindexed_packages, **kwargs):
serializer = IndexPackagesResponseSerializer(response_data, context={'request': request})
return Response(serializer.data)

@extend_schema(
    parameters=[
        OpenApiParameter('purl', str, 'query', description='PackageURL', required=True),
    ],
    responses={200: PackageAPISerializer()},
)
@action(detail=False, methods=['get'], serializer_class=CollectPackageSerializer)
def reindex_metadata(self, request, *args, **kwargs):
    """
    Collect or recollect the package metadata of a ``PURL`` string.
    Also recollects all packages in the set of the PURL.

    If the PURL does exist, calling this endpoint will re-collect, re-store and
    return the Package metadata immediately.

    If the package does not exist in the database this call does nothing and
    returns an HTTP 400 response.

    NOTE: this WILL NOT re-run scan and indexing in the background in contrast
    with the /collect and collect/index_packages endpoints.

    NOTE: per the commit that introduced this endpoint, metadata-only
    recollection is enabled for Maven only for now.

    **Request example**::

        /api/collect/reindex_metadata/?purl=pkg:maven/<namespace>/<name>@<version>
    """
    serializer = self.serializer_class(data=request.query_params)
    if not serializer.is_valid():
        return Response(
            {'errors': serializer.errors},
            status=status.HTTP_400_BAD_REQUEST,
        )

    purl = serializer.validated_data.get('purl')

    # Only an existence check is needed here, so use exists() rather than
    # counting every matching row.
    lookups = purl_to_lookups(purl)
    if not Package.objects.filter(**lookups).exists():
        return Response(
            {'status': f'Not recollecting: Package does not exist for {purl}'},
            status=status.HTTP_400_BAD_REQUEST,
        )

    # Tell the downstream handler to only reindex metadata.
    kwargs["reindex_metadata"] = True

    # At least one package matches the purl: recollect its metadata live.
    try:
        errors = priority_router.process(purl, **kwargs)
    except NoRouteAvailable:
        message = {
            'status': f'cannot fetch Package data for {purl}: no available handler'
        }
        return Response(message, status=status.HTTP_400_BAD_REQUEST)

    # Query again after processing so the response reflects the stored data.
    packages = Package.objects.filter(**purl_to_lookups(purl))
    if not packages.exists():
        message = {}
        if errors:
            message = {
                'status': f'error(s) occurred when fetching metadata for {purl}: {errors}'
            }
        # NOTE(review): no explicit status is passed, so this is returned with
        # the default HTTP 200 even when errors were reported -- confirm that
        # this is intended.
        return Response(message)

    serializer = PackageAPISerializer(packages, many=True, context={'request': request})
    return Response(serializer.data)



class PurlValidateViewSet(viewsets.ViewSet):
"""
Expand Down
8 changes: 6 additions & 2 deletions packagedb/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,13 +1211,17 @@ def __str__(self):


def make_relationship(
    from_package, to_package, relationship,
):
    """
    Return the PackageRelation going from ``from_package`` to ``to_package``
    with the given ``relationship``, creating it only if it does not already
    exist.
    """
    # get_or_create avoids inserting a duplicate relation when the same
    # from/to/relationship triple is submitted again.
    relation, _created = PackageRelation.objects.get_or_create(
        from_package=from_package,
        to_package=to_package,
        relationship=relationship,
    )
    return relation


class PackageWatch(models.Model):
Expand Down

0 comments on commit f5b7641

Please sign in to comment.