From 32b10351c0339e433a95a3bb551395f8942a99f3 Mon Sep 17 00:00:00 2001
From: Andrew Pollock
Date: Thu, 12 Sep 2024 17:15:27 +1000
Subject: [PATCH] feat: use GCS object update time instead of creation time
 (#2454)

This should achieve the same outcome while being easier to manipulate: the
update time can be reset without recreating the object, so triggering a
reimport of specific records becomes a little more efficient (no need to
download and re-upload the object just to reset its _creation_ time).

Tested in Staging:

Before:

```
$ gsutil stat gs://osv-test-cve-osv-conversion/osv-output/CVE-2024-7067.json
gs://osv-test-cve-osv-conversion/osv-output/CVE-2024-7067.json:
    Creation time:          Fri, 26 Jul 2024 16:53:59 GMT
    Update time:            Wed, 31 Jul 2024 03:52:06 GMT
    Storage class:          STANDARD
    Content-Length:         3073
    Content-Type:           application/json
    Metadata:
        goog-reserved-file-mtime:1722394200
    Hash (crc32c):          Lo3Xwg==
    Hash (md5):             rdI8478THJQOVCgblWW9UQ==
    ETag:                   COmK0tyVxYcDED4=
    Generation:             1722012839216489
    Metageneration:         62
```

After:

```
$ gsutil stat gs://osv-test-cve-osv-conversion/osv-output/CVE-2024-7067.json
gs://osv-test-cve-osv-conversion/osv-output/CVE-2024-7067.json:
    Creation time:          Fri, 26 Jul 2024 16:53:59 GMT
    Update time:            Tue, 06 Aug 2024 07:07:12 GMT
    Storage class:          STANDARD
    Content-Length:         3073
    Content-Type:           application/json
    Metadata:
        goog-reserved-file-mtime:1722394200
    Hash (crc32c):          Lo3Xwg==
    Hash (md5):             rdI8478THJQOVCgblWW9UQ==
    ETag:                   COmK0tyVxYcDED8=
    Generation:             1722012839216489
    Metageneration:         63
```

I have a sneaking suspicion this may facilitate the intentional reimporting
of CVEs that get updated when one of their constituent parts changes (the
fact that the original creation and update times were already divergent
surprised me).

This also requires `combine-to-osv` to switch back to using `gsutil rsync`
instead of `gcloud storage rsync`, due to a subtle difference in how the two
treat the GCS object modification time for otherwise unchanged objects (see
#2196 for additional context).
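For illustration only (not part of the committed change), here is a minimal
sketch of the no-op metadata patch this relies on, using the same
`google-cloud-storage` calls as the updated datafix tool; the bucket and
object names are just the staging example from above:

```
# Minimal sketch, assuming the google-cloud-storage client library and the
# staging bucket/object from the example above. A PATCH with no field changes
# leaves the object's content alone but advances its "Update time" and
# Metageneration, which is now enough for the importer to pick it up again.
from google.cloud import storage

client = storage.Client()
blob = client.bucket("osv-test-cve-osv-conversion").blob(
    "osv-output/CVE-2024-7067.json")
blob.patch()  # no-op metadata patch: bumps the update time only
print(blob.updated)  # the new modification time the importer compares against
```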
---------

Co-authored-by: Rex P <106129829+another-rex@users.noreply.github.com>
---
 docker/importer/importer.py          |  3 ++-
 tools/datafix/reimport_gcs_record.py | 18 +++++-------------
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/docker/importer/importer.py b/docker/importer/importer.py
index 371a3bbbee3..f3654efe87e 100755
--- a/docker/importer/importer.py
+++ b/docker/importer/importer.py
@@ -463,7 +463,8 @@ def convert_blob_to_vuln(blob: storage.Blob) -> Optional[Tuple[str, str]]:
       if not _is_vulnerability_file(source_repo, blob.name):
         return None
       if not ignore_last_import_time and \
-          not blob.time_created > utc_last_update_date:
+          blob.updated is not None and \
+          not blob.updated > utc_last_update_date:
         return None
 
       logging.info('Bucket entry triggered for %s/%s', source_repo.bucket,
diff --git a/tools/datafix/reimport_gcs_record.py b/tools/datafix/reimport_gcs_record.py
index defa67c0572..41270fc23d7 100755
--- a/tools/datafix/reimport_gcs_record.py
+++ b/tools/datafix/reimport_gcs_record.py
@@ -114,26 +114,19 @@ def bucket_for_source(client: datastore.Client, source: str) -> str:
   return result[0]['bucket']
 
 
-def reset_object_creation(bucket_name: str,
-                          blob_name: str,
-                          tmpdir="/tmp") -> None:
+def reset_object_modification(bucket_name: str, blob_name: str) -> None:
   """Resets a GCS object's creation time.
 
-  Copies the object locally and uploads it again.
+  Makes a no-op patch ("gcloud storage objects update" equivalent)
 
   Args:
     bucket_name: the name of the GCS bucket.
     blob_name: the name of the object in the bucket.
-    tmpdir: a preexisting directory in the local filesystem to copy the object
-      to/from.
   """
-  local_tmp_file = os.path.join(tmpdir, os.path.basename(blob_name))
   gcs_client = storage.Client()
   bucket = gcs_client.bucket(bucket_name)
   blob = bucket.blob(blob_name)
-  blob.download_to_filename(local_tmp_file)
-  blob.upload_from_filename(local_tmp_file, retry=retry.DEFAULT_RETRY)
-  os.unlink(local_tmp_file)
+  blob.patch(retry=retry.DEFAULT_RETRY)
 
 
 def main() -> None:
@@ -204,11 +197,10 @@ def main() -> None:
         print(f"Skipping {bug['db_id']}, got {e}\n")
         continue
     if args.verbose:
-      print(f"Resetting creation time for {bug_in_gcs['uri']}")
+      print(f"Resetting modification time for {bug_in_gcs['uri']}")
     if not args.dryrun:
       try:
-        reset_object_creation(bug_in_gcs["bucket"], bug_in_gcs["path"],
-                              args.tmpdir)
+        reset_object_modification(bug_in_gcs["bucket"], bug_in_gcs["path"])
       except NotFound as e:
         if args.verbose:
           print(f"Skipping, got {e}\n")