# -*- coding: utf-8 -*-
"""Copy static assets referenced by HTML files from S3 into a GCS bucket.

Originally added as ``scripts/migrations/20231006_migrate_storage.py``.

Environment variables:
    BUCKET_NAME: name of the destination Google Cloud Storage bucket.
    SOURCE_PATH: directory searched recursively for ``*.html`` files
        (defaults to the current working directory).
"""
import logging
from os import getenv
from pathlib import Path
from re import findall

import requests
from google.cloud import storage

logger = logging.getLogger(__name__)

bucket_name = getenv("BUCKET_NAME")
source_path = Path(getenv("SOURCE_PATH", "."))

url_prefix = "https://basedosdados-static.s3.us-east-2.amazonaws.com/"
url_prefix_exp = r'(https:\/\/basedosdados-static\.s3\.us-east-2\.amazonaws\.com\/[^"]*)'

# Seconds to wait for S3 before giving up on a single asset; without a
# timeout one stalled connection would hang the whole migration.
_REQUEST_TIMEOUT = 30


def run():
    """Mirror every S3 asset referenced in the HTML sources to the bucket.

    Steps to execute:
    - Set the environment variables (``BUCKET_NAME``, optionally
      ``SOURCE_PATH``)
    - Run the script

    Each matching URL is downloaded once into ``./.blobs/<key>`` (used as a
    local cache) and uploaded to the bucket under the lower-cased key,
    unless a blob with that name already exists. The script is therefore
    safe to re-run: assets already cached or uploaded are skipped.

    Raises:
        RuntimeError: if ``BUCKET_NAME`` is not set.
    """
    if not bucket_name:
        raise RuntimeError("BUCKET_NAME environment variable must be set")

    client = storage.Client()
    bucket = client.bucket(bucket_name)
    cache_dir = Path(".").resolve() / ".blobs"

    def download_and_upload(url, urlpath: str, filepath: Path):
        """Fetch ``url`` into ``filepath`` if needed, then upload it.

        A failed download is logged and skipped so that one missing asset
        does not abort the migration (and so that we never try to upload a
        file that was not written).
        """
        filepath.parent.mkdir(parents=True, exist_ok=True)
        if not filepath.exists():
            try:
                response = requests.get(url, timeout=_REQUEST_TIMEOUT)
            except requests.RequestException as error:
                logger.warning("Skipping %s: download failed (%s)", url, error)
                return
            if response.status_code != 200:
                logger.warning(
                    "Skipping %s: unexpected HTTP status %s",
                    url,
                    response.status_code,
                )
                return
            with filepath.open("wb") as f:
                f.write(response.content)

        blob = bucket.blob(urlpath)
        if not blob.exists():
            blob.upload_from_filename(str(filepath))

    # The same asset is usually referenced from many pages; remember the
    # keys already handled to avoid repeated round-trips to the bucket.
    seen = set()
    for path in source_path.glob("**/*.html"):
        for url in findall(url_prefix_exp, path.read_text()):
            urlpath = url.replace(url_prefix, "").lower()
            # A bare prefix or a "directory" URL has no object key to copy,
            # and would otherwise create a file where a cache directory is
            # expected.
            if not urlpath or urlpath.endswith("/"):
                continue
            if urlpath in seen:
                continue
            seen.add(urlpath)
            download_and_upload(url, urlpath, cache_dir / urlpath)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    run()