Skip to content

Commit

Permalink
Add pipeline to advertise scancode.io scans
Browse files Browse the repository at this point in the history
Signed-off-by: Keshav Priyadarshi <[email protected]>
  • Loading branch information
keshav-space committed Nov 29, 2024
1 parent 12c30d3 commit 6d01e09
Show file tree
Hide file tree
Showing 5 changed files with 295 additions and 2 deletions.
97 changes: 97 additions & 0 deletions fedcode/pipelines/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# FederatedCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/federatedcode for support or download.
# See https://aboutcode.org for more information about AboutCode.org OSS projects.
#

import logging
from datetime import datetime
from datetime import timezone
from timeit import default_timer as timer

from aboutcode.pipeline import BasePipeline
from aboutcode.pipeline import humanize_time

# Module-level logger; FederatedCodePipeline.log() writes every message to it.
module_logger = logging.getLogger(__name__)


class classproperty:
    """
    Descriptor exposing a function as a read-only property of a class.

    The wrapped function is called with the owner class, so the value can be
    read either from the class itself or from one of its instances.
    """

    def __init__(self, fget):
        # `fget` is the function computing the value from the owner class.
        self.fget = fget

    def __get__(self, owner_self, owner_cls=None):
        """Return ``fget(owner_cls)``, deriving the class from the instance if needed."""
        # The descriptor protocol allows the owner to be omitted when the
        # lookup goes through an instance; fall back to the instance type.
        if owner_cls is None:
            owner_cls = type(owner_self)
        return self.fget(owner_cls)


class FederatedCodePipeline(BasePipeline):
    """
    Base class for FederatedCode pipelines.

    Runs the steps declared on the pipeline class in order, logs progress
    with timestamps and calls ``on_failure`` when a step raises.

    Subclasses are expected to assign a non-empty ``pipeline_id`` class
    attribute; the ``pipeline_id`` class property below only runs when no
    subclass provides one.
    """

    def on_failure(self):
        """
        Tasks to run in the event that pipeline execution fails.
        Implement cleanup or other tasks that need to be performed
        on pipeline failure, such as:
        - Removing cloned repositories.
        - Deleting downloaded archives.
        """
        pass

    def execute(self):
        """
        Execute each step in the order defined on this pipeline class.

        Return a ``(exitcode, output)`` tuple: ``(0, "")`` when every step
        completed, or ``(1, <output_from_exception>)`` as soon as a step raises.
        """
        self.log(f"Pipeline [{self.pipeline_name}] starting")

        steps = self.pipeline_class.get_steps(groups=self.selected_groups)
        steps_count = len(steps)
        pipeline_start_time = timer()

        for current_index, step in enumerate(steps, start=1):
            step_name = step.__name__

            # When an explicit step selection is set, run only those steps.
            if self.selected_steps and step_name not in self.selected_steps:
                self.log(f"Step [{step_name}] skipped")
                continue

            self.set_current_step(f"{current_index}/{steps_count} {step_name}")
            self.log(f"Step [{step_name}] starting")
            step_start_time = timer()

            try:
                step(self)
            except Exception as exception:
                # Top-level boundary: any step error stops the pipeline, runs
                # the failure hook and is reported through the return value.
                self.log("Pipeline failed")
                on_failure_start_time = timer()
                self.log("Running [on_failure] tasks")
                self.on_failure()
                on_failure_run_time = timer() - on_failure_start_time
                self.log(f"Completed [on_failure] tasks in {humanize_time(on_failure_run_time)}")

                return 1, self.output_from_exception(exception)

            step_run_time = timer() - step_start_time
            self.log(f"Step [{step_name}] completed in {humanize_time(step_run_time)}")

        self.set_current_step("")  # Reset the `current_step` field on completion
        pipeline_run_time = timer() - pipeline_start_time
        self.log(f"Pipeline completed in {humanize_time(pipeline_run_time)}")

        return 0, ""

    def log(self, message, level=logging.INFO):
        """Log the given `message` to the current module logger and execution_log."""
        # Timestamp in local time with millisecond precision.
        now_local = datetime.now(timezone.utc).astimezone()
        timestamp = now_local.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        message = f"{timestamp} {message}"
        module_logger.log(level, message)
        self.append_to_log(message)

    @classproperty
    def pipeline_id(cls):
        """
        Return the unique pipeline_id declared on the pipeline class.

        Raise NotImplementedError when no class in the hierarchy declares a
        non-empty ``pipeline_id``.
        """
        # Reading `cls.pipeline_id` here would re-enter this descriptor and
        # recurse forever, so inspect each class namespace directly and skip
        # the descriptor itself.
        for klass in cls.__mro__:
            declared = vars(klass).get("pipeline_id")
            if isinstance(declared, classproperty):
                continue
            if declared is not None and declared != "":
                return declared
        raise NotImplementedError("pipeline_id is not defined or is empty")
113 changes: 113 additions & 0 deletions fedcode/pipelines/sync_scancode_scans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# FederatedCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/federatedcode for support or download.
# See https://aboutcode.org for more information about AboutCode.org OSS projects.
#

from pathlib import Path
from traceback import format_exc as traceback_format_exc

from aboutcode.pipeline import LoopProgress

from fedcode.models import Package
from fedcode.models import Repository
from fedcode.pipelines import FederatedCodePipeline
from fedcode.pipes import utils


class SyncScanCodeScans(FederatedCodePipeline):
    """Sync Package scans from FederatedCode git repositories."""

    pipeline_id = "sync_scancode_scans"

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return (
            cls.get_git_repos,
            cls.sync_scan_repositories,
        )

    def get_git_repos(self):
        """Store the queryset of all Repository rows on the pipeline instance."""
        self.git_repos = Repository.objects.all()

    def sync_scan_repositories(self):
        """Pull every repository and sync its package scans, logging the total."""
        repositories_count = self.git_repos.count()
        self.log(f"Syncing package scans from {repositories_count:,d} repositories")

        synced_package_scan_count = 0
        progress = LoopProgress(total_iterations=repositories_count, logger=self.log)
        # The queryset already yields Repository rows: use them directly
        # instead of looking each one up again through `get_or_create`, which
        # passed the model instance where a URL string is expected.
        for repository in progress.iter(self.git_repos.iterator(chunk_size=2000)):
            # Update the local checkout before computing what changed.
            repository.git_repo_obj.remotes.origin.pull()
            synced_package_scan_count += sync_scancodeio_scan(
                repository=repository,
                logger=self.log,
            )

        self.log(f"Successfully synced {synced_package_scan_count:,d} package scans")


def sync_scancodeio_scan(repository, logger):
    """
    Sync the package scans of `repository` and return the number of scans handled.

    When `repository.last_imported_commit` is set, only the changes between
    that commit and the current head are synced; otherwise every scan file in
    the working tree is synced. The head commit hash is then saved as the new
    `last_imported_commit`.
    """
    repo = repository.git_repo_obj
    latest_commit = repo.head.commit
    latest_commit_hash = latest_commit.hexsha

    if last_commit_hash := repository.last_imported_commit:
        last_imported_commit = repo.commit(last_commit_hash)
        diffs = last_imported_commit.diff(latest_commit)
        # `sync_scan_from_diff` selects the scan files itself, looking at both
        # sides of each change; pre-filtering on `a_path` alone would drop
        # changes whose scan file only shows up as `b_path`.
        scan_count = sync_scan_from_diff(diffs=diffs, repository=repository, logger=logger)
    else:
        scan_count = sync_all_scan(repository=repository, logger=logger)

    repository.last_imported_commit = latest_commit_hash
    repository.save()

    return scan_count


def sync_scan_from_diff(diffs, repository, logger):
    """
    Sync the package scans touched by `diffs` and return how many were synced.

    `diffs` is an iterable of diff entries exposing `a_path`, `b_path` and
    `change_type`. Added, modified and renamed scans ("A", "M", "R") create a
    note from the new path; deleted scans ("D") delete the note for the old
    path. Entries with any other change type, or whose relevant path is not a
    scan file, are logged and skipped.
    """
    scan_file_name = "scancodeio.json"
    scans = [
        item
        for item in diffs
        # Either side of a change may have no path; treat it as empty.
        if (item.a_path or "").endswith(scan_file_name)
        or (item.b_path or "").endswith(scan_file_name)
    ]
    scan_count = len(scans)

    logger(f"Syncing {scan_count:,d} package scan from {repository.url}")
    progress = LoopProgress(total_iterations=scan_count, logger=logger)
    synced_count = 0
    for scan in progress.iter(scans):
        change_type = scan.change_type
        if change_type in ("A", "M", "R"):
            scan_path, action = scan.b_path, utils.create_note
        elif change_type == "D":
            scan_path, action = scan.a_path, utils.delete_note
        else:
            scan_path, action = None, None

        # Never fall through with names left over from a previous iteration,
        # and never build a package from a path that is not a scan file.
        if action is None or not (scan_path or "").endswith(scan_file_name):
            logger(
                f"Skipping {scan.b_path or scan.a_path}: "
                f"unsupported change type {change_type!r}"
            )
            continue

        purl = utils.package_metadata_path_to_purl(path=Path(scan_path), version=False)
        package, _ = Package.objects.get_or_create(purl=str(purl), service=repository.admin)
        note = utils.get_scan_note(path=Path(scan_path))
        action(pkg=package, note_dict=note)
        synced_count += 1
    return synced_count


def sync_all_scan(repository, logger):
    """
    Sync every `scancodeio.json` found under the repository working tree.

    A package and its scan note are created for each scan file. Return the
    number of scan files found.
    """
    repo = repository.git_repo_obj
    root = Path(repo.working_dir)

    # Walk the tree once: the count and the iteration then describe the same
    # set of files, instead of two separate traversals.
    scans = list(root.rglob("scancodeio.json"))
    scan_count = len(scans)
    logger(f"Syncing {scan_count:,d} package scan from {repo.remotes.origin.url}")

    progress = LoopProgress(total_iterations=scan_count, logger=logger)
    for scan in progress.iter(scans):
        # Package identity is derived from the path relative to the checkout.
        relative_path = scan.relative_to(root)
        purl = utils.package_metadata_path_to_purl(relative_path, version=False)
        package, _ = Package.objects.get_or_create(purl=str(purl), service=repository.admin)
        note = utils.get_scan_note(path=relative_path)
        utils.create_note(pkg=package, note_dict=note)
    return scan_count
74 changes: 74 additions & 0 deletions fedcode/pipes/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# FederatedCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/federatedcode for support or download.
# See https://aboutcode.org for more information about AboutCode.org OSS projects.
#

import saneyaml
from packageurl import PackageURL

from fedcode.activitypub import Activity
from fedcode.activitypub import CreateActivity
from fedcode.activitypub import DeleteActivity
from fedcode.models import Note


def create_note(pkg, note_dict):
    """
    Attach a note holding `note_dict` (dumped as YAML) to `pkg` and federate
    a Create activity for it to the inboxes of the package followers.
    """
    content = saneyaml.dump(note_dict)
    note, _created = Note.objects.get_or_create(acct=pkg.acct, content=content)
    pkg.notes.add(note)

    activity = CreateActivity(actor=pkg.to_ap, object=note.to_ap)
    inboxes = pkg.followers_inboxes
    payload = activity.to_ap()
    signing_key_id = pkg.key_id
    Activity.federate(targets=inboxes, body=payload, key_id=signing_key_id)


def delete_note(pkg, note_dict):
    """
    Delete the note of `pkg` whose content is `note_dict` dumped as YAML, and
    federate a Delete activity for it to the inboxes of the package followers.

    The note is looked up with ``Note.objects.get``, so a missing note raises
    the lookup error of that call.
    """
    note = Note.objects.get(acct=pkg.acct, content=saneyaml.dump(note_dict))
    # Capture the ActivityPub form while the note still exists.
    note_ap = note.to_ap

    # Detach before deleting: Django sets the primary key of a deleted
    # instance to None, after which it can no longer identify the row to
    # remove from `pkg.notes`.
    pkg.notes.remove(note)
    note.delete()

    deleted_activity = DeleteActivity(actor=pkg.to_ap, object=note_ap)
    Activity.federate(
        targets=pkg.followers_inboxes,
        # NOTE(review): `create_note` passes `create_activity.to_ap()` (called)
        # while this passes `to_ap` uncalled; confirm whether
        # `DeleteActivity.to_ap` is a property or a method.
        body=deleted_activity.to_ap,
        key_id=pkg.key_id,
    )


def package_metadata_path_to_purl(path, version=True):
    """
    Return PURL from relative metadata path.

    `path` is a path object whose last part is the metadata file name and
    whose second to last part is the package version; the leading parts make
    up the PURL type, namespace and name. The version is included in the
    PURL only when `version` is true.

    Raise ValueError when `path` has fewer than four parts.
    """
    parts = path.parts
    if len(parts) < 4:
        # The exception was previously built but never raised, which let
        # invalid paths through to produce a meaningless PURL.
        raise ValueError("Not a valid package metadata path.")

    purl = f"pkg:{'/'.join(parts[:-2])}"
    if version:
        purl = f"{purl}@{parts[-2]}"
    return PackageURL.from_string(purl=purl)


def get_scan_note(path):
    """
    Build the note dictionary describing the package scan stored at `path`.

    The note carries the versioned PURL derived from `path` and a single
    scan entry for the `scancodeio.json` file.
    """
    package_url = str(package_metadata_path_to_purl(path=path))

    # TODO: Use tool-alias.yml to get tool for corresponding tool
    # for scan https://github.com/aboutcode-org/federatedcode/issues/24
    scancode_scan = {
        "tool": "pkg:pypi/scancode-toolkit",
        "file_name": "scancodeio.json",
    }
    return {"purl": package_url, "scans": [scancode_scan]}
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
aboutcode.pipeline==0.1.0
aboutcode-toolkit==10.1.0
alabaster==0.7.13
anyio==4.1.0
Expand Down
12 changes: 10 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,13 @@ install_requires =
django-rest-framework>=0.1.0
djangorestframework>=3.14.0
django-environ>=0.10.0
django-ninja>=1.2.1
gunicorn>=21.2.0
GitPython>=3.1.31
requests>=2.31.0
saneyaml>=0.6.0
#
httpx>=0.24.1
http-message-signatures>=0.4.4
pydantic>=2.8.2

anyio>=4.1.0
asgiref>=3.7.2
Expand All @@ -85,7 +83,10 @@ install_requires =
packageurl-python>=0.11.1
packaging>=23.1
pathspec>=0.11.2

#??
Pillow>=9.5.0

platformdirs>=3.10.0
pluggy>=1.0.0
pycparser>=2.21
Expand All @@ -105,6 +106,13 @@ install_requires =
unidiff>=0.7.5
urllib3>=2.0.3
wrapt>=1.15.0

#schema
django-ninja>=1.2.1
pydantic>=2.8.2

#pipeline
aboutcode.pipeline>=0.1.0


[options.extras_require]
Expand Down

0 comments on commit 6d01e09

Please sign in to comment.