Merge pull request #182 from dyvenia/dev
Release 0.2.8
m-paz authored Oct 29, 2021
2 parents 1086f47 + 5d3709a commit 12de657
Showing 11 changed files with 135 additions and 46 deletions.
14 changes: 5 additions & 9 deletions .github/workflows/build.yml
@@ -98,7 +98,7 @@ jobs:
publish_docker:
needs: build
runs-on: ubuntu-latest
if: github.event_name == 'push'
if: github.ref == 'refs/heads/dev' && github.event_name == 'push'
steps:
- name: 'Checkout source code'
uses: actions/checkout@v2
@@ -114,15 +114,11 @@ jobs:
- name: Push image
run: |
IMAGE_ID=docker.pkg.github.com/${{ github.repository }}/$IMAGE_NAME
# Change all uppercase to lowercase
IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]')
# Strip git ref prefix from version
VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,')
# Use Docker `dev` tag
[ "$VERSION" == "dev" ] && VERSION=dev
# Publish with the `dev` label
echo IMAGE_ID=$IMAGE_ID
echo VERSION=$VERSION
docker tag $IMAGE_NAME $IMAGE_ID:$VERSION
docker push $IMAGE_ID:$VERSION
docker tag $IMAGE_NAME $IMAGE_ID:dev
docker push $IMAGE_ID:dev
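The tightened `if:` condition above requires both the branch and the event to match before the `dev` image is published. The same predicate, written out in Python for illustration (the function name is ours, not part of the workflow):

```python
def should_publish_dev_image(ref: str, event_name: str) -> bool:
    # Mirrors the workflow condition:
    # if: github.ref == 'refs/heads/dev' && github.event_name == 'push'
    return ref == "refs/heads/dev" and event_name == "push"
```

A push to `main`, or a pull request targeting `dev`, no longer triggers a publish; only a direct push to `dev` does.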
15 changes: 12 additions & 3 deletions CHANGELOG.md
@@ -4,9 +4,20 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [Unreleased]

## [0.2.8] - 2021-10-29
### Changed
- CI/CD: `dev` image is now only published on push to the `dev` branch
- Docker:
- updated registry links to use the new `ghcr.io` domain
- `run.sh` now also accepts the `-t` option. When run in standard mode, it will only spin up the `viadot_jupyter_lab` service.
When run with `-t dev`, it will also spin up the `viadot_testing` and `viadot_docs` containers.

### Fixed
- ADLSToAzureSQL - fixed path parameter issue.


## [0.2.7] - 2021-10-04
### Added
- Added `SQLiteQuery` task
@@ -21,8 +32,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
- Changed CI/CD algorithm
- the `latest` Docker image is now only updated on release and is the same exact image as the latest release
- the `dev` image is released only on pushes and PRs to the `dev` branch (so dev branch = dev image)

### Changed
- Modified `ADLSToAzureSQL` - *read_sep* and *write_sep* parameters added to the flow.

### Fixed
61 changes: 47 additions & 14 deletions README.md
@@ -43,38 +43,71 @@ insert_task = SQLiteInsert()
insert_task.run(table_name=TABLE_NAME, dtypes=dtypes, db_path=database_path, df=df, if_exists="replace")
```

## Set up

__Note__: If you're running on Unix, after cloning the repo, you may need to grant executable privileges to the `update.sh` and `run.sh` scripts:
```
sudo chmod +x viadot/docker/update.sh && \
sudo chmod +x viadot/docker/run.sh
```

### a) user
Clone the `main` branch, enter the `docker` folder, and set up the environment:
```
git clone https://github.com/dyvenia/viadot.git && \
cd viadot/docker && \
./update.sh
```

Run the environment:
```
./run.sh
```

### b) developer
Clone the `dev` branch, enter the `docker` folder, and set up the environment:
```
git clone -b dev https://github.com/dyvenia/viadot.git && \
cd viadot/docker && \
./update.sh -t dev
```

Run the environment:
```
./run.sh -t dev
```

Install the library in development mode (repeat for the `viadot_jupyter_lab` container if needed):
```
docker exec -it viadot_testing pip install -e . --user
```


## Running tests

To run tests, log into the container and run pytest:
```
cd viadot/docker
./run.sh
docker exec -it viadot_testing bash
pytest
```

## Running flows locally

You can run the example flows from the terminal:
```
./run.sh
docker exec -it viadot_testing bash
FLOW_NAME=hello_world; python -m viadot.examples.$FLOW_NAME
```

However, when developing, the easiest way is to use the provided Jupyter Lab container available at `http://localhost:9000/`.
However, when developing, the easiest way is to use the provided Jupyter Lab container available in the browser at `http://localhost:9000/`.


## How to contribute
1. Clone the release branch
2. Pull the docker env by running `viadot/docker/update.sh -t dev`
3. Run the env with `viadot/docker/run.sh`
4. Log into the dev container and install in development mode so that viadot will auto-install at each code change:
```
docker exec -it viadot_testing bash
pip install -e .
```
5. Edit and test your changes with `pytest`
6. Submit a PR. The PR should contain the following:

1. Fork the repository if you do not have write access
2. Set up locally
3. Test your changes with `pytest`
4. Submit a PR. The PR should contain the following:
- new/changed functionality
- tests for the changes
- changes added to `CHANGELOG.md`
9 changes: 6 additions & 3 deletions docker/docker-compose.yml
@@ -2,16 +2,18 @@ version: "3"

services:
viadot:
image: viadot:dev
image: viadot:${IMAGE_TAG:-latest}
container_name: viadot_testing
profiles: ["dev"]
volumes:
- ../:/home/viadot
tty: true
stdin_open: true
restart: "always"
viadot_docs:
image: viadot:dev
image: viadot:${IMAGE_TAG:-latest}
container_name: viadot_docs
profiles: ["dev"]
volumes:
- ../:/home/viadot
working_dir: /home/viadot
@@ -20,8 +22,9 @@ services:
command: "mkdocs serve"
restart: "always"
viadot_jupyter_lab:
image: viadot:dev
image: viadot:${IMAGE_TAG:-latest}
container_name: viadot_jupyter_lab
profiles: ["dev", "user"]
ports:
- 9000:8888
volumes:
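The `${IMAGE_TAG:-latest}` substitution used in the image names falls back to `latest` when `IMAGE_TAG` is unset *or* set to the empty string. A small Python sketch of that Compose rule (the helper is ours, not part of Compose or viadot):

```python
import os

def resolve_image_tag(environ=None):
    # Compose's ${IMAGE_TAG:-latest}: fall back to "latest"
    # when IMAGE_TAG is unset OR set to the empty string.
    environ = os.environ if environ is None else environ
    tag = environ.get("IMAGE_TAG", "")
    return tag if tag else "latest"
```

This is why `run.sh` can simply prefix the `docker-compose` call with `IMAGE_TAG=...`: when no tag is passed, every service resolves to `viadot:latest`.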
21 changes: 20 additions & 1 deletion docker/run.sh
@@ -1 +1,20 @@
docker-compose up -d --force-recreate
IMAGE_TAG=latest
PROFILE="user"

while getopts t: flag
do
case "${flag}" in
t) IMAGE_TAG=${OPTARG}
case ${OPTARG} in
dev) PROFILE="dev";;
esac
;;
esac
done

IMAGE_TAG=$IMAGE_TAG docker-compose --profile $PROFILE up -d --force-recreate

echo ""
echo "Press Enter to exit."

read
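In plain terms, the new `run.sh` defaults to the `user` profile with the `latest` tag, and `-t dev` switches both the image tag and the Compose profile. A hypothetical Python rendering of the flag handling, for illustration only (the function name is ours):

```python
def resolve_run_options(args):
    # Mirrors run.sh: default to the "user" profile and "latest" tag;
    # "-t dev" switches both the tag and the compose profile to dev,
    # while any other "-t" value changes only the tag.
    image_tag, profile = "latest", "user"
    it = iter(args)
    for arg in it:
        if arg == "-t":
            image_tag = next(it, image_tag)
            if image_tag == "dev":
                profile = "dev"
    return image_tag, profile
```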
4 changes: 2 additions & 2 deletions docker/update.sh
@@ -1,4 +1,4 @@
IMAGE_ID=docker.pkg.github.com/dyvenia/viadot/viadot
IMAGE_ID=ghcr.io/dyvenia/viadot/viadot
IMAGE_TAG=latest


@@ -10,7 +10,7 @@ do
done


docker login https://docker.pkg.github.com
docker login ghcr.io
docker pull $IMAGE_ID:$IMAGE_TAG
docker tag $IMAGE_ID:$IMAGE_TAG viadot:$IMAGE_TAG
docker image rm $IMAGE_ID:$IMAGE_TAG
12 changes: 11 additions & 1 deletion tests/integration/flows/test_adls_to_azure_sql.py
@@ -1,10 +1,20 @@
from viadot.flows import ADLSToAzureSQL


def test_get_promoted_adls_path_file():
def test_get_promoted_adls_path_csv_file():
adls_path_file = "raw/supermetrics/adls_ga_load_times_fr_test/2021-07-14T13%3A09%3A02.997357%2B00%3A00.csv"
flow = ADLSToAzureSQL(name="test", adls_path=adls_path_file)
promoted_path = flow.get_promoted_path(env="conformed")
assert (
promoted_path
== "conformed/supermetrics/adls_ga_load_times_fr_test/2021-07-14T13%3A09%3A02.997357%2B00%3A00.csv"
)


def test_get_promoted_adls_path_parquet_file():
adls_path_file = "raw/supermetrics/adls_ga_load_times_fr_test/2021-07-14T13%3A09%3A02.997357%2B00%3A00.parquet"
flow = ADLSToAzureSQL(name="test", adls_path=adls_path_file)
promoted_path = flow.get_promoted_path(env="conformed")
assert promoted_path == "conformed/supermetrics/adls_ga_load_times_fr_test.csv"
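Taken together, the two tests pin down the promotion rule: a CSV keeps its full path under the new env prefix, while a Parquet file is promoted as a CSV named after its parent folder. A hypothetical standalone re-implementation of that rule (a sketch mirroring the behaviour asserted above, not the actual `ADLSToAzureSQL.get_promoted_path` code):

```python
def promote_path(adls_path: str, env: str = "conformed") -> str:
    # Hypothetical mirror of the behaviour the tests assert;
    # not the viadot implementation itself.
    parts = adls_path.strip("/").split("/")
    extension = parts[-1].split(".")[-1]
    if extension == "csv":
        # CSVs keep their file name; only the top-level env changes.
        return "/".join([env] + parts[1:])
    # Parquet files are promoted as a CSV named after their parent folder.
    return "/".join([env] + parts[1:-1]) + ".csv"
```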


2 changes: 1 addition & 1 deletion tests/test_viadot.py
@@ -2,4 +2,4 @@


def test_version():
assert __version__ == "0.2.7"
assert __version__ == "0.2.9"
2 changes: 1 addition & 1 deletion viadot/__init__.py
@@ -1 +1 @@
__version__ = "0.2.7"
__version__ = "0.2.9"
39 changes: 28 additions & 11 deletions viadot/flows/adls_to_azure_sql.py
@@ -74,6 +74,11 @@ def df_to_csv_task(df, path: str, sep: str = "\t"):
df.to_csv(path, sep=sep, index=False)


@task
def df_to_parquet_task(df, path: str):
df.to_parquet(path)


class ADLSToAzureSQL(Flow):
def __init__(
self,
@@ -125,7 +130,6 @@ def __init__(
vault_name (str, optional): The name of the vault from which to obtain the secrets. Defaults to None.
"""
adls_path = adls_path.strip("/")

# Read parquet
if adls_path.split(".")[-1] in ["csv", "parquet"]:
self.adls_path = adls_path
@@ -196,10 +200,26 @@ def get_promoted_path(self, env: str) -> str:

return promoted_path

def create_to_file_task(self, df, file_type):
df_to_type = None
if file_type == "csv":
df_to_type = df_to_csv_task.bind(
df=df,
path=self.local_file_path,
sep=self.write_sep,
flow=self,
)
else:
df_to_type = df_to_parquet_task.bind(
df=df,
path=self.local_file_path,
flow=self,
)
return df_to_type

def gen_flow(self) -> Flow:
adls_raw_file_path = Parameter("adls_raw_file_path", default=self.adls_path)
df = lake_to_df_task.bind(
path=adls_raw_file_path,
path=self.adls_path,
sp_credentials_secret=self.adls_sp_credentials_secret,
sep=self.read_sep,
flow=self,
@@ -217,12 +237,9 @@ def gen_flow(self) -> Flow:
else:
dtypes = self.dtypes

df_to_csv = df_to_csv_task.bind(
df=df,
path=self.local_file_path,
sep=self.write_sep,
flow=self,
)
adls_file_type = self.adls_path.split(".")[-1]
df_to_type = self.create_to_file_task(df, adls_file_type)

promote_to_conformed_task.bind(
from_path=self.local_file_path,
to_path=self.adls_path_conformed,
@@ -257,8 +274,8 @@
)

# dtypes.set_upstream(download_json_file_task, flow=self)
promote_to_conformed_task.set_upstream(df_to_csv, flow=self)
promote_to_conformed_task.set_upstream(df_to_type, flow=self)
# map_data_types_task.set_upstream(download_json_file_task, flow=self)
create_table_task.set_upstream(df_to_csv, flow=self)
create_table_task.set_upstream(df_to_type, flow=self)
promote_to_operations_task.set_upstream(promote_to_conformed_task, flow=self)
bulk_insert_task.set_upstream(create_table_task, flow=self)
2 changes: 2 additions & 0 deletions viadot/flows/supermetrics_to_adls.py
@@ -346,3 +346,5 @@ def gen_flow(self) -> Flow:
file_to_adls_task.set_upstream(df_to_file, flow=self)
json_to_adls_task.set_upstream(dtypes_to_json_task, flow=self)
set_key_value(key=self.adls_dir_path, value=self.adls_file_path)

shutil.rmtree(self.local_json_path)
