diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 32d771176..78a3c2494 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -98,7 +98,7 @@ jobs: publish_docker: needs: build runs-on: ubuntu-latest - if: github.event_name == 'push' + if: github.ref == 'refs/heads/dev' && github.event_name == 'push' steps: - name: 'Checkout source code' uses: actions/checkout@v2 @@ -114,15 +114,11 @@ jobs: - name: Push image run: | IMAGE_ID=docker.pkg.github.com/${{ github.repository }}/$IMAGE_NAME + # Change all uppercase to lowercase IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]') - # Strip git ref prefix from version - VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,') - - # Use Docker `dev` tag - [ "$VERSION" == "dev"] && VERSION=dev + # Publish with the `dev` label echo IMAGE_ID=$IMAGE_ID - echo VERSION=$VERSION - docker tag $IMAGE_NAME $IMAGE_ID:$VERSION - docker push $IMAGE_ID:$VERSION \ No newline at end of file + docker tag $IMAGE_NAME $IMAGE_ID:dev + docker push $IMAGE_ID:dev \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index b7904a062..899e3cdab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,20 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - ## [Unreleased] +## [0.2.8] - 2021-10-29 ### Changed - CI/CD: `dev` image is now only published on push to the `dev` branch - Docker: - updated registry links to use the new `ghcr.io` domain - `run.sh` now also accepts the `-t` option. When run in standard mode, it will only spin up the `viadot_jupyter_lab` service. When run with `-t dev`, it will also spin up `viadot_testing` and `viadot_docs` containers. ### Fixed - ADLSToAzureSQL - fixed path parameter issue. 
+ + ## [0.2.7] - 2021-10-04 ### Added - Added `SQLiteQuery` task @@ -21,8 +32,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Changed CI/CD algorithm - the `latest` Docker image is now only updated on release and is the same exact image as the latest release - the `dev` image is released only on pushes and PRs to the `dev` branch (so dev branch = dev image) - -### Changed - Modified `ADLSToAzureSQL` - *read_sep* and *write_sep* parameters added to the flow. ### Fixed diff --git a/README.md b/README.md index dcfb62c7e..b0a304dff 100644 --- a/README.md +++ b/README.md @@ -43,38 +43,71 @@ insert_task = SQLiteInsert() insert_task.run(table_name=TABLE_NAME, dtypes=dtypes, db_path=database_path, df=df, if_exists="replace") ``` +## Set up + +__Note__: If you're running on Unix, after cloning the repo, you may need to grant executable privileges to the `update.sh` and `run.sh` scripts: +``` +sudo chmod +x viadot/docker/update.sh && \ +sudo chmod +x viadot/docker/run.sh +``` + +### a) user +Clone the `main` branch, enter the `docker` folder, and set up the environment: +``` +git clone https://github.com/dyvenia/viadot.git && \ +cd viadot/docker && \ +./update.sh +``` + +Run the environment: +``` +./run.sh +``` + +### b) developer +Clone the `dev` branch, enter the `docker` folder, and set up the environment: +``` +git clone -b dev https://github.com/dyvenia/viadot.git && \ +cd viadot/docker && \ +./update.sh -t dev +``` + +Run the environment: +``` +./run.sh -t dev +``` + +Install the library in development mode (repeat for the `viadot_jupyter_lab` container if needed): +``` +docker exec -it viadot_testing pip install -e . 
--user +``` + ## Running tests + To run tests, log into the container and run pytest: ``` -cd viadot/docker -run.sh docker exec -it viadot_testing bash pytest ``` ## Running flows locally + You can run the example flows from the terminal: ``` -run.sh docker exec -it viadot_testing bash FLOW_NAME=hello_world; python -m viadot.examples.$FLOW_NAME ``` -However, when developing, the easiest way is to use the provided Jupyter Lab container available at `http://localhost:9000/`. +However, when developing, the easiest way is to use the provided Jupyter Lab container available in the browser at `http://localhost:9000/`. ## How to contribute -1. Clone the release branch -2. Pull the docker env by running `viadot/docker/update.sh -t dev` -3. Run the env with `viadot/docker/run.sh` -4. Log into the dev container and install in development mode so that viadot will auto-install at each code change: -``` -docker exec -it viadot_testing bash -pip install -e . -``` -5. Edit and test your changes with `pytest` -6. Submit a PR. The PR should contain the following: + +1. Fork repository if you do not have write access +2. Set up locally +3. Test your changes with `pytest` +4. Submit a PR. 
The PR should contain the following: - new/changed functionality - tests for the changes - changes added to `CHANGELOG.md` diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index dc5449a86..2acef9c63 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -2,16 +2,18 @@ version: "3" services: viadot: - image: viadot:dev + image: viadot:${IMAGE_TAG:-latest} container_name: viadot_testing + profiles: ["dev"] volumes: - ../:/home/viadot tty: true stdin_open: true restart: "always" viadot_docs: - image: viadot:dev + image: viadot:${IMAGE_TAG:-latest} container_name: viadot_docs + profiles: ["dev"] volumes: - ../:/home/viadot working_dir: /home/viadot @@ -20,8 +22,9 @@ services: command: "mkdocs serve" restart: "always" viadot_jupyter_lab: - image: viadot:dev + image: viadot:${IMAGE_TAG:-latest} container_name: viadot_jupyter_lab + profiles: ["dev", "user"] ports: - 9000:8888 volumes: diff --git a/docker/run.sh b/docker/run.sh index 0029fd07f..8a1671a10 100644 --- a/docker/run.sh +++ b/docker/run.sh @@ -1 +1,20 @@ -docker-compose up -d --force-recreate \ No newline at end of file +IMAGE_TAG=latest +PROFILE="user" + +while getopts t: flag +do + case "${flag}" in + t) IMAGE_TAG=${OPTARG} + case ${OPTARG} in + dev) PROFILE="dev";; + esac + ;; + esac +done + +IMAGE_TAG=$IMAGE_TAG docker-compose --profile $PROFILE up -d --force-recreate + +echo "" +echo "Press Enter to exit." 
+ +read \ No newline at end of file diff --git a/docker/update.sh b/docker/update.sh index 6c9058931..75bb50ab2 100644 --- a/docker/update.sh +++ b/docker/update.sh @@ -1,4 +1,4 @@ -IMAGE_ID=docker.pkg.github.com/dyvenia/viadot/viadot +IMAGE_ID=ghcr.io/dyvenia/viadot/viadot IMAGE_TAG=latest @@ -10,7 +10,7 @@ do done -docker login https://docker.pkg.github.com +docker login ghcr.io docker pull $IMAGE_ID:$IMAGE_TAG docker tag $IMAGE_ID:$IMAGE_TAG viadot:$IMAGE_TAG docker image rm $IMAGE_ID:$IMAGE_TAG diff --git a/tests/integration/flows/test_adls_to_azure_sql.py b/tests/integration/flows/test_adls_to_azure_sql.py index 35123e16d..ff42c141b 100644 --- a/tests/integration/flows/test_adls_to_azure_sql.py +++ b/tests/integration/flows/test_adls_to_azure_sql.py @@ -1,10 +1,20 @@ from viadot.flows import ADLSToAzureSQL -def test_get_promoted_adls_path_file(): +def test_get_promoted_adls_path_csv_file(): adls_path_file = "raw/supermetrics/adls_ga_load_times_fr_test/2021-07-14T13%3A09%3A02.997357%2B00%3A00.csv" flow = ADLSToAzureSQL(name="test", adls_path=adls_path_file) promoted_path = flow.get_promoted_path(env="conformed") + assert ( + promoted_path + == "conformed/supermetrics/adls_ga_load_times_fr_test/2021-07-14T13%3A09%3A02.997357%2B00%3A00.csv" + ) + + +def test_get_promoted_adls_path_parquet_file(): + adls_path_file = "raw/supermetrics/adls_ga_load_times_fr_test/2021-07-14T13%3A09%3A02.997357%2B00%3A00.parquet" + flow = ADLSToAzureSQL(name="test", adls_path=adls_path_file) + promoted_path = flow.get_promoted_path(env="conformed") assert promoted_path == "conformed/supermetrics/adls_ga_load_times_fr_test.csv" diff --git a/tests/test_viadot.py b/tests/test_viadot.py index 870e0fe17..13b8146a1 100644 --- a/tests/test_viadot.py +++ b/tests/test_viadot.py @@ -2,4 +2,4 @@ def test_version(): - assert __version__ == "0.2.7" + assert __version__ == "0.2.9" diff --git a/viadot/__init__.py b/viadot/__init__.py index 6cd38b746..75cf7831c 100644 --- a/viadot/__init__.py +++ 
b/viadot/__init__.py @@ -1 +1 @@ -__version__ = "0.2.7" +__version__ = "0.2.9" diff --git a/viadot/flows/adls_to_azure_sql.py b/viadot/flows/adls_to_azure_sql.py index 5f08f2489..e9e88d489 100644 --- a/viadot/flows/adls_to_azure_sql.py +++ b/viadot/flows/adls_to_azure_sql.py @@ -74,6 +74,11 @@ def df_to_csv_task(df, path: str, sep: str = "\t"): df.to_csv(path, sep=sep, index=False) +@task +def df_to_parquet_task(df, path: str): + df.to_parquet(path) + + class ADLSToAzureSQL(Flow): def __init__( self, @@ -125,7 +130,6 @@ def __init__( vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None. """ adls_path = adls_path.strip("/") - # Read parquet if adls_path.split(".")[-1] in ["csv", "parquet"]: self.adls_path = adls_path @@ -196,10 +200,26 @@ def get_promoted_path(self, env: str) -> str: return promoted_path + def create_to_file_task(self, df, file_type): + df_to_type = None + if file_type == "csv": + df_to_type = df_to_csv_task.bind( + df=df, + path=self.local_file_path, + sep=self.write_sep, + flow=self, + ) + else: + df_to_type = df_to_parquet_task.bind( + df=df, + path=self.local_file_path, + flow=self, + ) + return df_to_type + def gen_flow(self) -> Flow: - adls_raw_file_path = Parameter("adls_raw_file_path", default=self.adls_path) df = lake_to_df_task.bind( - path=adls_raw_file_path, + path=self.adls_path, sp_credentials_secret=self.adls_sp_credentials_secret, sep=self.read_sep, flow=self, @@ -217,12 +237,9 @@ def gen_flow(self) -> Flow: else: dtypes = self.dtypes - df_to_csv = df_to_csv_task.bind( - df=df, - path=self.local_file_path, - sep=self.write_sep, - flow=self, - ) + adls_file_type = self.adls_path.split(".")[-1] + df_to_type = self.create_to_file_task(df, adls_file_type) + promote_to_conformed_task.bind( from_path=self.local_file_path, to_path=self.adls_path_conformed, @@ -257,8 +274,8 @@ def gen_flow(self) -> Flow: ) # dtypes.set_upstream(download_json_file_task, flow=self) - 
promote_to_conformed_task.set_upstream(df_to_csv, flow=self) + promote_to_conformed_task.set_upstream(df_to_type, flow=self) # map_data_types_task.set_upstream(download_json_file_task, flow=self) - create_table_task.set_upstream(df_to_csv, flow=self) + create_table_task.set_upstream(df_to_type, flow=self) promote_to_operations_task.set_upstream(promote_to_conformed_task, flow=self) bulk_insert_task.set_upstream(create_table_task, flow=self) diff --git a/viadot/flows/supermetrics_to_adls.py b/viadot/flows/supermetrics_to_adls.py index 794d258be..a9e051c59 100644 --- a/viadot/flows/supermetrics_to_adls.py +++ b/viadot/flows/supermetrics_to_adls.py @@ -346,3 +346,5 @@ def gen_flow(self) -> Flow: file_to_adls_task.set_upstream(df_to_file, flow=self) json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self) set_key_value(key=self.adls_dir_path, value=self.adls_file_path) + + shutil.rmtree(self.local_json_path)