feat: migrate website storage blobs (#431)
vncsna authored Oct 6, 2023
1 parent 02c6507 commit 3a677cc
Showing 1 changed file with 45 additions and 0 deletions.
scripts/migrations/20231006_migrate_storage.py (45 additions, 0 deletions)
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
from os import getenv
from pathlib import Path
from re import findall

import requests
from google.cloud import storage

# Target GCS bucket and the local directory containing the rendered website HTML
bucket_name = getenv("BUCKET_NAME")
source_path = Path(getenv("SOURCE_PATH", "."))

# Legacy S3 prefix being migrated, and a regex capturing every matching URL
# inside a double-quoted HTML attribute
url_prefix = "https://basedosdados-static.s3.us-east-2.amazonaws.com/"
url_prefix_exp = r'(https:\/\/basedosdados-static\.s3\.us-east-2\.amazonaws\.com\/[^"]*)'


def run():
    """
    Download every static asset referenced in the website HTML and
    re-upload it to the target Google Cloud Storage bucket.

    Steps to execute:
    - Set the environment variables (BUCKET_NAME, SOURCE_PATH)
    - Run the script
    """

    client = storage.Client()
    bucket = client.bucket(bucket_name)

    def download_and_upload(url: str, urlpath: str, filepath: Path):
        filepath.parent.mkdir(parents=True, exist_ok=True)
        # Download the file only if it is not already cached locally
        if not filepath.exists():
            response = requests.get(url)
            if response.status_code == 200:
                with filepath.open("wb") as f:
                    f.write(response.content)
        # Upload only if the download succeeded and the blob does not exist yet
        blob = bucket.blob(urlpath)
        if filepath.exists() and not blob.exists():
            blob.upload_from_filename(filepath)

    for path in source_path.glob("**/*.html"):
        for url in findall(url_prefix_exp, path.read_text()):
            urlpath = url.replace(url_prefix, "").lower()
            filepath = Path(".").resolve() / ".blobs" / urlpath
            download_and_upload(url, urlpath, filepath)


if __name__ == "__main__":
    run()
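
A minimal sketch of the URL-to-blob mapping the script performs, run against a made-up HTML snippet; the image name and contents below are illustrative only and do not come from the commit:

    from re import findall

    html = '<img src="https://basedosdados-static.s3.us-east-2.amazonaws.com/Images/Logo.PNG">'
    pattern = r'(https:\/\/basedosdados-static\.s3\.us-east-2\.amazonaws\.com\/[^"]*)'

    # Each captured URL is stripped of the S3 prefix and lowercased to form the blob name.
    for url in findall(pattern, html):
        blob_path = url.replace("https://basedosdados-static.s3.us-east-2.amazonaws.com/", "").lower()
        print(blob_path)  # prints: images/logo.png

Note that BUCKET_NAME and SOURCE_PATH must be set in the environment before the module is imported, since both are read at module level.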
