diff --git a/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb b/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb
index 3094430..ba8d5dd 100644
--- a/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb
+++ b/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb
@@ -67,55 +67,6 @@
"spark = create_pyspark_session()"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Loads raw `Order` data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(#rows, #columns) = (3683040, 22)\n",
- "root\n",
- " |-- cpf: string (nullable = true)\n",
- " |-- customer_id: string (nullable = true)\n",
- " |-- customer_name: string (nullable = true)\n",
- " |-- delivery_address_city: string (nullable = true)\n",
- " |-- delivery_address_country: string (nullable = true)\n",
- " |-- delivery_address_district: string (nullable = true)\n",
- " |-- delivery_address_external_id: string (nullable = true)\n",
- " |-- delivery_address_latitude: string (nullable = true)\n",
- " |-- delivery_address_longitude: string (nullable = true)\n",
- " |-- delivery_address_state: string (nullable = true)\n",
- " |-- delivery_address_zip_code: string (nullable = true)\n",
- " |-- items: string (nullable = true)\n",
- " |-- merchant_id: string (nullable = true)\n",
- " |-- merchant_latitude: string (nullable = true)\n",
- " |-- merchant_longitude: string (nullable = true)\n",
- " |-- merchant_timezone: string (nullable = true)\n",
- " |-- order_created_at: timestamp (nullable = true)\n",
- " |-- order_id: string (nullable = true)\n",
- " |-- order_scheduled: boolean (nullable = true)\n",
- " |-- order_scheduled_date: timestamp (nullable = true)\n",
- " |-- order_total_amount: double (nullable = true)\n",
- " |-- origin_platform: string (nullable = true)\n",
- "\n"
- ]
- }
- ],
- "source": [
- "df = spark.read.parquet(str(RAW_DATA_PATH / 'order'))\n",
- "explore_dataframe(df)"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb b/dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb
deleted file mode 100644
index 1bf1515..0000000
--- a/dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb
+++ /dev/null
@@ -1,1421 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "The autoreload extension is already loaded. To reload it, use:\n",
- " %reload_ext autoreload\n"
- ]
- }
- ],
- "source": [
- "%load_ext autoreload\n",
- "%autoreload 2\n",
- "from src.config import RAW_DATA_PATH\n",
- "from src.DataProcessor import explore_dataframe, extract_latest_values, add_prefix, create_trusted_order\n",
- "from src.IOController import create_pyspark_session"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Starting PySpark session. Check your terminal for detailed logging...\n",
- "PySpark session sucessfully created.\n"
- ]
- }
- ],
- "source": [
- "spark = create_pyspark_session()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Starting processing to generate Order Items dataset...\n",
- "Exporting dataset file system...\n",
- "Dataset sucessfully exported to `/home/jovyan/data/trusted/order`!\n"
- ]
- }
- ],
- "source": [
- "tmp = create_trusted_order(spark)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "pyspark.sql.session.SparkSession"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "type(spark)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tmp.limit(3).toPandas().T"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(#rows, #columns) = (3683040, 22)\n",
- "root\n",
- " |-- cpf: string (nullable = true)\n",
- " |-- customer_id: string (nullable = true)\n",
- " |-- customer_name: string (nullable = true)\n",
- " |-- delivery_address_city: string (nullable = true)\n",
- " |-- delivery_address_country: string (nullable = true)\n",
- " |-- delivery_address_district: string (nullable = true)\n",
- " |-- delivery_address_external_id: string (nullable = true)\n",
- " |-- delivery_address_latitude: string (nullable = true)\n",
- " |-- delivery_address_longitude: string (nullable = true)\n",
- " |-- delivery_address_state: string (nullable = true)\n",
- " |-- delivery_address_zip_code: string (nullable = true)\n",
- " |-- items: string (nullable = true)\n",
- " |-- merchant_id: string (nullable = true)\n",
- " |-- merchant_latitude: string (nullable = true)\n",
- " |-- merchant_longitude: string (nullable = true)\n",
- " |-- merchant_timezone: string (nullable = true)\n",
- " |-- order_created_at: timestamp (nullable = true)\n",
- " |-- order_id: string (nullable = true)\n",
- " |-- order_scheduled: boolean (nullable = true)\n",
- " |-- order_scheduled_date: timestamp (nullable = true)\n",
- " |-- order_total_amount: double (nullable = true)\n",
- " |-- origin_platform: string (nullable = true)\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " cpf | \n",
- " 80532101763 | \n",
- " 43352103961 | \n",
- " 38650991217 | \n",
- "
\n",
- " \n",
- " customer_id | \n",
- " 977b9a89-825f-464b-8ef6-0f453d7334c1 | \n",
- " e969cc0d-388b-4025-9351-0db0f718d81c | \n",
- " e08dcc8b-f998-405e-b3f2-7107ea8958cf | \n",
- "
\n",
- " \n",
- " customer_name | \n",
- " GUSTAVO | \n",
- " MICHELLE | \n",
- " VICTOR | \n",
- "
\n",
- " \n",
- " delivery_address_city | \n",
- " FRANCA | \n",
- " SANTOS | \n",
- " GUARULHOS | \n",
- "
\n",
- " \n",
- " delivery_address_country | \n",
- " BR | \n",
- " BR | \n",
- " BR | \n",
- "
\n",
- " \n",
- " delivery_address_district | \n",
- " JARDIM ESPRAIADO | \n",
- " CAMPO GRANDE | \n",
- " JARDIM ROSSI | \n",
- "
\n",
- " \n",
- " delivery_address_external_id | \n",
- " 6736655 | \n",
- " 8759216 | \n",
- " 8765930 | \n",
- "
\n",
- " \n",
- " delivery_address_latitude | \n",
- " -47.39 | \n",
- " -46.34 | \n",
- " -46.53 | \n",
- "
\n",
- " \n",
- " delivery_address_longitude | \n",
- " -20.55 | \n",
- " -23.96 | \n",
- " -23.44 | \n",
- "
\n",
- " \n",
- " delivery_address_state | \n",
- " SP | \n",
- " SP | \n",
- " SP | \n",
- "
\n",
- " \n",
- " delivery_address_zip_code | \n",
- " 14403 | \n",
- " 11070 | \n",
- " 71304 | \n",
- "
\n",
- " \n",
- " items | \n",
- " [{\"name\": \"Parmegiana de Filé de Frango (2 pes... | \n",
- " [{\"name\": \"Filé Mignon à Cubana\", \"addition\": ... | \n",
- " [{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va... | \n",
- "
\n",
- " \n",
- " merchant_id | \n",
- " eb4197f9-964c-4f87-8307-709e498aab87 | \n",
- " 927d46f9-4bb3-48f7-be1d-584deaf18adc | \n",
- " 71ad62c5-5947-4518-9846-976fbdd2f881 | \n",
- "
\n",
- " \n",
- " merchant_latitude | \n",
- " -47.39 | \n",
- " -46.34 | \n",
- " -46.53 | \n",
- "
\n",
- " \n",
- " merchant_longitude | \n",
- " -20.55 | \n",
- " -23.96 | \n",
- " -23.44 | \n",
- "
\n",
- " \n",
- " merchant_timezone | \n",
- " America/Sao_Paulo | \n",
- " America/Sao_Paulo | \n",
- " America/Sao_Paulo | \n",
- "
\n",
- " \n",
- " order_created_at | \n",
- " 2019-01-17 22:50:06 | \n",
- " 2019-01-17 17:51:26 | \n",
- " 2019-01-17 22:53:47 | \n",
- "
\n",
- " \n",
- " order_id | \n",
- " dd4f8f0a-c2cb-45c6-a002-c3be6b305e5f | \n",
- " 8dd80f0b-db00-4b88-b7e2-02ca706fc5a5 | \n",
- " 430f9887-a563-45ee-8001-1cb29597d9dd | \n",
- "
\n",
- " \n",
- " order_scheduled | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- "
\n",
- " \n",
- " order_scheduled_date | \n",
- " NaT | \n",
- " NaT | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " order_total_amount | \n",
- " 46 | \n",
- " 104.5 | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " origin_platform | \n",
- " ANDROID | \n",
- " ANDROID | \n",
- " IOS | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " 0 \\\n",
- "cpf 80532101763 \n",
- "customer_id 977b9a89-825f-464b-8ef6-0f453d7334c1 \n",
- "customer_name GUSTAVO \n",
- "delivery_address_city FRANCA \n",
- "delivery_address_country BR \n",
- "delivery_address_district JARDIM ESPRAIADO \n",
- "delivery_address_external_id 6736655 \n",
- "delivery_address_latitude -47.39 \n",
- "delivery_address_longitude -20.55 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 14403 \n",
- "items [{\"name\": \"Parmegiana de Filé de Frango (2 pes... \n",
- "merchant_id eb4197f9-964c-4f87-8307-709e498aab87 \n",
- "merchant_latitude -47.39 \n",
- "merchant_longitude -20.55 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-17 22:50:06 \n",
- "order_id dd4f8f0a-c2cb-45c6-a002-c3be6b305e5f \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 46 \n",
- "origin_platform ANDROID \n",
- "\n",
- " 1 \\\n",
- "cpf 43352103961 \n",
- "customer_id e969cc0d-388b-4025-9351-0db0f718d81c \n",
- "customer_name MICHELLE \n",
- "delivery_address_city SANTOS \n",
- "delivery_address_country BR \n",
- "delivery_address_district CAMPO GRANDE \n",
- "delivery_address_external_id 8759216 \n",
- "delivery_address_latitude -46.34 \n",
- "delivery_address_longitude -23.96 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 11070 \n",
- "items [{\"name\": \"Filé Mignon à Cubana\", \"addition\": ... \n",
- "merchant_id 927d46f9-4bb3-48f7-be1d-584deaf18adc \n",
- "merchant_latitude -46.34 \n",
- "merchant_longitude -23.96 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-17 17:51:26 \n",
- "order_id 8dd80f0b-db00-4b88-b7e2-02ca706fc5a5 \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 104.5 \n",
- "origin_platform ANDROID \n",
- "\n",
- " 2 \n",
- "cpf 38650991217 \n",
- "customer_id e08dcc8b-f998-405e-b3f2-7107ea8958cf \n",
- "customer_name VICTOR \n",
- "delivery_address_city GUARULHOS \n",
- "delivery_address_country BR \n",
- "delivery_address_district JARDIM ROSSI \n",
- "delivery_address_external_id 8765930 \n",
- "delivery_address_latitude -46.53 \n",
- "delivery_address_longitude -23.44 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 71304 \n",
- "items [{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va... \n",
- "merchant_id 71ad62c5-5947-4518-9846-976fbdd2f881 \n",
- "merchant_latitude -46.53 \n",
- "merchant_longitude -23.44 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-17 22:53:47 \n",
- "order_id 430f9887-a563-45ee-8001-1cb29597d9dd \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 35 \n",
- "origin_platform IOS "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "o_df = spark.read.parquet(str(RAW_DATA_PATH / 'order'))\n",
- "explore_dataframe(o_df)\n",
- "o_df.limit(3).toPandas().T"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(#rows, #columns) = (809323, 7)\n",
- "root\n",
- " |-- customer_id: string (nullable = true)\n",
- " |-- language: string (nullable = true)\n",
- " |-- created_at: string (nullable = true)\n",
- " |-- active: string (nullable = true)\n",
- " |-- customer_name: string (nullable = true)\n",
- " |-- customer_phone_area: string (nullable = true)\n",
- " |-- customer_phone_number: string (nullable = true)\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id | \n",
- " 00039466-560f-4e57-85a2-d4753cd901be | \n",
- " 001a1267-31a3-4f5b-a028-d7e323864b08 | \n",
- " 003ae1d5-67b8-4a04-b055-0e4e9622771a | \n",
- "
\n",
- " \n",
- " consumer_language | \n",
- " pt-br | \n",
- " pt-br | \n",
- " pt-br | \n",
- "
\n",
- " \n",
- " consumer_created_at | \n",
- " 2018-04-05T14:49:18.165Z | \n",
- " 2018-01-14T21:40:02.141Z | \n",
- " 2018-01-07T03:47:15.554Z | \n",
- "
\n",
- " \n",
- " consumer_active | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- "
\n",
- " \n",
- " consumer_customer_name | \n",
- " NUNO | \n",
- " ADRIELLY | \n",
- " PAULA | \n",
- "
\n",
- " \n",
- " consumer_customer_phone_area | \n",
- " 46 | \n",
- " 59 | \n",
- " 62 | \n",
- "
\n",
- " \n",
- " consumer_customer_phone_number | \n",
- " 816135924 | \n",
- " 231330577 | \n",
- " 347597883 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " 0 \\\n",
- "customer_id 00039466-560f-4e57-85a2-d4753cd901be \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-04-05T14:49:18.165Z \n",
- "consumer_active true \n",
- "consumer_customer_name NUNO \n",
- "consumer_customer_phone_area 46 \n",
- "consumer_customer_phone_number 816135924 \n",
- "\n",
- " 1 \\\n",
- "customer_id 001a1267-31a3-4f5b-a028-d7e323864b08 \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-01-14T21:40:02.141Z \n",
- "consumer_active true \n",
- "consumer_customer_name ADRIELLY \n",
- "consumer_customer_phone_area 59 \n",
- "consumer_customer_phone_number 231330577 \n",
- "\n",
- " 2 \n",
- "customer_id 003ae1d5-67b8-4a04-b055-0e4e9622771a \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-01-07T03:47:15.554Z \n",
- "consumer_active true \n",
- "consumer_customer_name PAULA \n",
- "consumer_customer_phone_area 62 \n",
- "consumer_customer_phone_number 347597883 "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "table = 'consumer'\n",
- "c_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n",
- "explore_dataframe(c_df)\n",
- "c_df = add_prefix(c_df, table)\n",
- "c_df.limit(3).toPandas().T"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(#rows, #columns) = (7292, 12)\n",
- "root\n",
- " |-- id: string (nullable = true)\n",
- " |-- created_at: string (nullable = true)\n",
- " |-- enabled: string (nullable = true)\n",
- " |-- price_range: string (nullable = true)\n",
- " |-- average_ticket: string (nullable = true)\n",
- " |-- takeout_time: string (nullable = true)\n",
- " |-- delivery_time: string (nullable = true)\n",
- " |-- minimum_order_value: string (nullable = true)\n",
- " |-- merchant_zip_code: string (nullable = true)\n",
- " |-- merchant_city: string (nullable = true)\n",
- " |-- merchant_state: string (nullable = true)\n",
- " |-- merchant_country: string (nullable = true)\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " merchant_id | \n",
- " 02c94103-61f3-4906-a4a9-55611db9f28c | \n",
- " 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d | \n",
- " 33ca5d3d-b99f-404d-84d9-8df8f38a2261 | \n",
- "
\n",
- " \n",
- " restaurant_created_at | \n",
- " 2017-01-23T12:52:30.910Z | \n",
- " 2017-01-20T13:14:48.286Z | \n",
- " 2017-01-23T12:46:33.457Z | \n",
- "
\n",
- " \n",
- " restaurant_enabled | \n",
- " false | \n",
- " true | \n",
- " true | \n",
- "
\n",
- " \n",
- " restaurant_price_range | \n",
- " 3 | \n",
- " 3 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " restaurant_average_ticket | \n",
- " 60.0 | \n",
- " 60.0 | \n",
- " 100.0 | \n",
- "
\n",
- " \n",
- " restaurant_takeout_time | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " restaurant_delivery_time | \n",
- " 50 | \n",
- " 0 | \n",
- " 45 | \n",
- "
\n",
- " \n",
- " restaurant_minimum_order_value | \n",
- " 30.0 | \n",
- " 30.0 | \n",
- " 10.0 | \n",
- "
\n",
- " \n",
- " restaurant_merchant_zip_code | \n",
- " 14025 | \n",
- " 50180 | \n",
- " 23090 | \n",
- "
\n",
- " \n",
- " restaurant_merchant_city | \n",
- " RIBEIRAO PRETO | \n",
- " SAO PAULO | \n",
- " RIO DE JANEIRO | \n",
- "
\n",
- " \n",
- " restaurant_merchant_state | \n",
- " SP | \n",
- " SP | \n",
- " RJ | \n",
- "
\n",
- " \n",
- " restaurant_merchant_country | \n",
- " BR | \n",
- " BR | \n",
- " BR | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " 0 \\\n",
- "merchant_id 02c94103-61f3-4906-a4a9-55611db9f28c \n",
- "restaurant_created_at 2017-01-23T12:52:30.910Z \n",
- "restaurant_enabled false \n",
- "restaurant_price_range 3 \n",
- "restaurant_average_ticket 60.0 \n",
- "restaurant_takeout_time 0 \n",
- "restaurant_delivery_time 50 \n",
- "restaurant_minimum_order_value 30.0 \n",
- "restaurant_merchant_zip_code 14025 \n",
- "restaurant_merchant_city RIBEIRAO PRETO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "\n",
- " 1 \\\n",
- "merchant_id 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d \n",
- "restaurant_created_at 2017-01-20T13:14:48.286Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 3 \n",
- "restaurant_average_ticket 60.0 \n",
- "restaurant_takeout_time 0 \n",
- "restaurant_delivery_time 0 \n",
- "restaurant_minimum_order_value 30.0 \n",
- "restaurant_merchant_zip_code 50180 \n",
- "restaurant_merchant_city SAO PAULO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "\n",
- " 2 \n",
- "merchant_id 33ca5d3d-b99f-404d-84d9-8df8f38a2261 \n",
- "restaurant_created_at 2017-01-23T12:46:33.457Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 5 \n",
- "restaurant_average_ticket 100.0 \n",
- "restaurant_takeout_time 0 \n",
- "restaurant_delivery_time 45 \n",
- "restaurant_minimum_order_value 10.0 \n",
- "restaurant_merchant_zip_code 23090 \n",
- "restaurant_merchant_city RIO DE JANEIRO \n",
- "restaurant_merchant_state RJ \n",
- "restaurant_merchant_country BR "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "table = 'restaurant'\n",
- "r_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n",
- "explore_dataframe(r_df)\n",
- "r_df = r_df.withColumnRenamed('id', 'merchant_id')\n",
- "r_df = add_prefix(r_df, table)\n",
- "r_df.limit(3).toPandas().T"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(#rows, #columns) = (11075048, 4)\n",
- "root\n",
- " |-- created_at: timestamp (nullable = true)\n",
- " |-- order_id: string (nullable = true)\n",
- " |-- status_id: string (nullable = true)\n",
- " |-- value: string (nullable = true)\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " status_created_at | \n",
- " order_id | \n",
- " status_id | \n",
- " status_value | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2019-01-25 01:05:07 | \n",
- " 0002fe02-d7dc-4232-b7ac-3394019ce240 | \n",
- " b4298862-fa38-499a-93e2-a76930fb2bce | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2019-01-24 23:04:27 | \n",
- " 0002fe02-d7dc-4232-b7ac-3394019ce240 | \n",
- " 7964bf63-007a-484d-a321-e9118ccc2f97 | \n",
- " REGISTERED | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2019-01-24 23:04:28 | \n",
- " 0002fe02-d7dc-4232-b7ac-3394019ce240 | \n",
- " ca16b92b-db8f-4274-b165-929675541a9f | \n",
- " PLACED | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " status_created_at order_id \\\n",
- "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "1 2019-01-24 23:04:27 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "2 2019-01-24 23:04:28 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "\n",
- " status_id status_value \n",
- "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n",
- "1 7964bf63-007a-484d-a321-e9118ccc2f97 REGISTERED \n",
- "2 ca16b92b-db8f-4274-b165-929675541a9f PLACED "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "table = 'status'\n",
- "s_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n",
- "explore_dataframe(s_df)\n",
- "s_df = add_prefix(s_df, table)\n",
- "s_df.limit(3).toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(#rows, #columns) = (2441067, 4)\n",
- "root\n",
- " |-- status_created_at: timestamp (nullable = true)\n",
- " |-- order_id: string (nullable = true)\n",
- " |-- status_id: string (nullable = true)\n",
- " |-- status_value: string (nullable = true)\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " status_created_at | \n",
- " order_id | \n",
- " status_id | \n",
- " status_value | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2019-01-25 01:05:07 | \n",
- " 0002fe02-d7dc-4232-b7ac-3394019ce240 | \n",
- " b4298862-fa38-499a-93e2-a76930fb2bce | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2019-01-03 20:15:06 | \n",
- " 0012d95c-9c4b-4244-86b5-dcf87677dcc1 | \n",
- " c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2019-01-06 16:20:27 | \n",
- " 0013fc5c-4c10-4402-886c-1b8166e4632e | \n",
- " d0a3ffd5-4e48-4cc4-9739-d5764678c19f | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2019-01-03 15:55:12 | \n",
- " 00251da6-aa45-4512-be58-6622a248cdff | \n",
- " a621de13-5272-4c7c-969c-3bcd53f0515f | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 2019-01-21 22:20:02 | \n",
- " 00273652-efc8-4e7c-95b2-cd3827900e7e | \n",
- " 74f8ff33-e731-477f-a698-4755577b80a1 | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " status_created_at order_id \\\n",
- "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "1 2019-01-03 20:15:06 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n",
- "2 2019-01-06 16:20:27 0013fc5c-4c10-4402-886c-1b8166e4632e \n",
- "3 2019-01-03 15:55:12 00251da6-aa45-4512-be58-6622a248cdff \n",
- "4 2019-01-21 22:20:02 00273652-efc8-4e7c-95b2-cd3827900e7e \n",
- "\n",
- " status_id status_value \n",
- "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n",
- "1 c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 CONCLUDED \n",
- "2 d0a3ffd5-4e48-4cc4-9739-d5764678c19f CONCLUDED \n",
- "3 a621de13-5272-4c7c-969c-3bcd53f0515f CONCLUDED \n",
- "4 74f8ff33-e731-477f-a698-4755577b80a1 CONCLUDED "
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "s_df = extract_latest_values(df=s_df, id_col='order_id', dt_col='status_created_at')\n",
- "explore_dataframe(s_df)\n",
- "s_df.limit(5).toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+------------+-------+\n",
- "|status_value| count|\n",
- "+------------+-------+\n",
- "| CONCLUDED|2354218|\n",
- "| CANCELLED| 55179|\n",
- "| PLACED| 31654|\n",
- "| REGISTERED| 16|\n",
- "+------------+-------+\n",
- "\n"
- ]
- }
- ],
- "source": [
- "s_df.groupBy('status_value').count().sort('count', ascending=False).show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " order_id | \n",
- " 0002fe02-d7dc-4232-b7ac-3394019ce240 | \n",
- " 0012d95c-9c4b-4244-86b5-dcf87677dcc1 | \n",
- " 0012d95c-9c4b-4244-86b5-dcf87677dcc1 | \n",
- "
\n",
- " \n",
- " merchant_id | \n",
- " e3d24e4d-2f51-4987-8c39-47923b20b9be | \n",
- " e66b0dcc-ffa9-42ee-a864-60977672c3ef | \n",
- " e66b0dcc-ffa9-42ee-a864-60977672c3ef | \n",
- "
\n",
- " \n",
- " customer_id | \n",
- " 97c53c25-bd9b-41cb-8a0d-13cd74509f17 | \n",
- " e2213649-cede-4770-b6e0-7ac1dd4d3548 | \n",
- " e2213649-cede-4770-b6e0-7ac1dd4d3548 | \n",
- "
\n",
- " \n",
- " cpf | \n",
- " 16854185492 | \n",
- " 05264105611 | \n",
- " 78163676650 | \n",
- "
\n",
- " \n",
- " customer_name | \n",
- " EDUARDO | \n",
- " BRUNO | \n",
- " BRUNO | \n",
- "
\n",
- " \n",
- " delivery_address_city | \n",
- " SAO PAULO | \n",
- " SAO PAULO | \n",
- " SAO PAULO | \n",
- "
\n",
- " \n",
- " delivery_address_country | \n",
- " BR | \n",
- " BR | \n",
- " BR | \n",
- "
\n",
- " \n",
- " delivery_address_district | \n",
- " ITAIM BIBI | \n",
- " PENHA DE FRANCA | \n",
- " PENHA DE FRANCA | \n",
- "
\n",
- " \n",
- " delivery_address_external_id | \n",
- " 8847122 | \n",
- " 7632090 | \n",
- " 7632090 | \n",
- "
\n",
- " \n",
- " delivery_address_latitude | \n",
- " -46.68 | \n",
- " -46.54 | \n",
- " -46.54 | \n",
- "
\n",
- " \n",
- " delivery_address_longitude | \n",
- " -23.59 | \n",
- " -23.52 | \n",
- " -23.52 | \n",
- "
\n",
- " \n",
- " delivery_address_state | \n",
- " SP | \n",
- " SP | \n",
- " SP | \n",
- "
\n",
- " \n",
- " delivery_address_zip_code | \n",
- " 45381 | \n",
- " 36100 | \n",
- " 36100 | \n",
- "
\n",
- " \n",
- " items | \n",
- " [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... | \n",
- " [{\"name\": \"Porção Batata Frita\", \"addition\": {... | \n",
- " [{\"name\": \"Porção Batata Frita\", \"addition\": {... | \n",
- "
\n",
- " \n",
- " merchant_latitude | \n",
- " -46.68 | \n",
- " -46.54 | \n",
- " -46.54 | \n",
- "
\n",
- " \n",
- " merchant_longitude | \n",
- " -23.59 | \n",
- " -23.52 | \n",
- " -23.52 | \n",
- "
\n",
- " \n",
- " merchant_timezone | \n",
- " America/Sao_Paulo | \n",
- " America/Sao_Paulo | \n",
- " America/Sao_Paulo | \n",
- "
\n",
- " \n",
- " order_created_at | \n",
- " 2019-01-24 23:04:27 | \n",
- " 2019-01-03 18:12:24 | \n",
- " 2018-12-04 18:12:24 | \n",
- "
\n",
- " \n",
- " order_scheduled | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- "
\n",
- " \n",
- " order_scheduled_date | \n",
- " NaT | \n",
- " NaT | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " order_total_amount | \n",
- " 27 | \n",
- " 17.5 | \n",
- " 17.5 | \n",
- "
\n",
- " \n",
- " origin_platform | \n",
- " ANDROID | \n",
- " ANDROID | \n",
- " ANDROID | \n",
- "
\n",
- " \n",
- " consumer_language | \n",
- " pt-br | \n",
- " pt-br | \n",
- " pt-br | \n",
- "
\n",
- " \n",
- " consumer_created_at | \n",
- " 2018-04-05T13:20:39.644Z | \n",
- " 2018-01-06T14:31:43.348Z | \n",
- " 2018-01-06T14:31:43.348Z | \n",
- "
\n",
- " \n",
- " consumer_active | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- "
\n",
- " \n",
- " consumer_customer_name | \n",
- " EDUARDO | \n",
- " BRUNO | \n",
- " BRUNO | \n",
- "
\n",
- " \n",
- " consumer_customer_phone_area | \n",
- " 83 | \n",
- " 60 | \n",
- " 60 | \n",
- "
\n",
- " \n",
- " consumer_customer_phone_number | \n",
- " 020082840 | \n",
- " 109441873 | \n",
- " 109441873 | \n",
- "
\n",
- " \n",
- " restaurant_created_at | \n",
- " 2017-01-20T13:14:41.451Z | \n",
- " 2017-01-20T13:14:16.179Z | \n",
- " 2017-01-20T13:14:16.179Z | \n",
- "
\n",
- " \n",
- " restaurant_enabled | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- "
\n",
- " \n",
- " restaurant_price_range | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " restaurant_average_ticket | \n",
- " 30.0 | \n",
- " 30.0 | \n",
- " 30.0 | \n",
- "
\n",
- " \n",
- " restaurant_takeout_time | \n",
- " 0 | \n",
- " 20 | \n",
- " 20 | \n",
- "
\n",
- " \n",
- " restaurant_delivery_time | \n",
- " 40 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " restaurant_minimum_order_value | \n",
- " 30.0 | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " restaurant_merchant_zip_code | \n",
- " 56560 | \n",
- " 36350 | \n",
- " 36350 | \n",
- "
\n",
- " \n",
- " restaurant_merchant_city | \n",
- " SAO PAULO | \n",
- " SAO PAULO | \n",
- " SAO PAULO | \n",
- "
\n",
- " \n",
- " restaurant_merchant_state | \n",
- " SP | \n",
- " SP | \n",
- " SP | \n",
- "
\n",
- " \n",
- " restaurant_merchant_country | \n",
- " BR | \n",
- " BR | \n",
- " BR | \n",
- "
\n",
- " \n",
- " status_created_at | \n",
- " 2019-01-25 01:05:07 | \n",
- " 2019-01-03 20:15:06 | \n",
- " 2019-01-03 20:15:06 | \n",
- "
\n",
- " \n",
- " status_id | \n",
- " b4298862-fa38-499a-93e2-a76930fb2bce | \n",
- " c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 | \n",
- " c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 | \n",
- "
\n",
- " \n",
- " status_value | \n",
- " CONCLUDED | \n",
- " CONCLUDED | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " 0 \\\n",
- "order_id 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "merchant_id e3d24e4d-2f51-4987-8c39-47923b20b9be \n",
- "customer_id 97c53c25-bd9b-41cb-8a0d-13cd74509f17 \n",
- "cpf 16854185492 \n",
- "customer_name EDUARDO \n",
- "delivery_address_city SAO PAULO \n",
- "delivery_address_country BR \n",
- "delivery_address_district ITAIM BIBI \n",
- "delivery_address_external_id 8847122 \n",
- "delivery_address_latitude -46.68 \n",
- "delivery_address_longitude -23.59 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 45381 \n",
- "items [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... \n",
- "merchant_latitude -46.68 \n",
- "merchant_longitude -23.59 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-24 23:04:27 \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 27 \n",
- "origin_platform ANDROID \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-04-05T13:20:39.644Z \n",
- "consumer_active true \n",
- "consumer_customer_name EDUARDO \n",
- "consumer_customer_phone_area 83 \n",
- "consumer_customer_phone_number 020082840 \n",
- "restaurant_created_at 2017-01-20T13:14:41.451Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 1 \n",
- "restaurant_average_ticket 30.0 \n",
- "restaurant_takeout_time 0 \n",
- "restaurant_delivery_time 40 \n",
- "restaurant_minimum_order_value 30.0 \n",
- "restaurant_merchant_zip_code 56560 \n",
- "restaurant_merchant_city SAO PAULO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "status_created_at 2019-01-25 01:05:07 \n",
- "status_id b4298862-fa38-499a-93e2-a76930fb2bce \n",
- "status_value CONCLUDED \n",
- "\n",
- " 1 \\\n",
- "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n",
- "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n",
- "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n",
- "cpf 05264105611 \n",
- "customer_name BRUNO \n",
- "delivery_address_city SAO PAULO \n",
- "delivery_address_country BR \n",
- "delivery_address_district PENHA DE FRANCA \n",
- "delivery_address_external_id 7632090 \n",
- "delivery_address_latitude -46.54 \n",
- "delivery_address_longitude -23.52 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 36100 \n",
- "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n",
- "merchant_latitude -46.54 \n",
- "merchant_longitude -23.52 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-03 18:12:24 \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 17.5 \n",
- "origin_platform ANDROID \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-01-06T14:31:43.348Z \n",
- "consumer_active true \n",
- "consumer_customer_name BRUNO \n",
- "consumer_customer_phone_area 60 \n",
- "consumer_customer_phone_number 109441873 \n",
- "restaurant_created_at 2017-01-20T13:14:16.179Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 1 \n",
- "restaurant_average_ticket 30.0 \n",
- "restaurant_takeout_time 20 \n",
- "restaurant_delivery_time 0 \n",
- "restaurant_minimum_order_value 0.0 \n",
- "restaurant_merchant_zip_code 36350 \n",
- "restaurant_merchant_city SAO PAULO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "status_created_at 2019-01-03 20:15:06 \n",
- "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n",
- "status_value CONCLUDED \n",
- "\n",
- " 2 \n",
- "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n",
- "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n",
- "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n",
- "cpf 78163676650 \n",
- "customer_name BRUNO \n",
- "delivery_address_city SAO PAULO \n",
- "delivery_address_country BR \n",
- "delivery_address_district PENHA DE FRANCA \n",
- "delivery_address_external_id 7632090 \n",
- "delivery_address_latitude -46.54 \n",
- "delivery_address_longitude -23.52 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 36100 \n",
- "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n",
- "merchant_latitude -46.54 \n",
- "merchant_longitude -23.52 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2018-12-04 18:12:24 \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 17.5 \n",
- "origin_platform ANDROID \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-01-06T14:31:43.348Z \n",
- "consumer_active true \n",
- "consumer_customer_name BRUNO \n",
- "consumer_customer_phone_area 60 \n",
- "consumer_customer_phone_number 109441873 \n",
- "restaurant_created_at 2017-01-20T13:14:16.179Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 1 \n",
- "restaurant_average_ticket 30.0 \n",
- "restaurant_takeout_time 20 \n",
- "restaurant_delivery_time 0 \n",
- "restaurant_minimum_order_value 0.0 \n",
- "restaurant_merchant_zip_code 36350 \n",
- "restaurant_merchant_city SAO PAULO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "status_created_at 2019-01-03 20:15:06 \n",
- "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n",
- "status_value CONCLUDED "
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tmp = (o_df\n",
- " .join(c_df, on='customer_id', how='left')\n",
- " .join(r_df, on='merchant_id', how='left')\n",
- " .join(s_df, on='order_id', how='left')\n",
- " )\n",
- "tmp.limit(3).toPandas().T"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Fix schema and anonymize sensitive data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "# for dtype, cols in dtypes.items():\n",
- "# for col in cols:\n",
- "# df = df.withColumn(col, df[col].cast(dtype))"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.5"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb b/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb
index 31255a6..0d3bbc4 100644
--- a/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb
+++ b/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb
@@ -1,879 +1,112 @@
{
"cells": [
{
- "cell_type": "code",
- "execution_count": 1,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "%load_ext autoreload\n",
- "%autoreload 2\n",
- "from src.config import RAW_DATA_PATH\n",
- "from src.DataProcessor import explore_dataframe, extract_latest_values, add_prefix, create_trusted_order\n",
- "from src.IOController import create_pyspark_session"
+ "# Trusted Data Generation - Order\n",
+ "\n",
+ "## Scope of notebook\n",
+ "\n",
+ "> Create `Order` dataset following the requirements below.\n",
+ "\n",
+ "* Order dataset - one line per order with all data from order, consumer, restaurant and the LAST status from order statuses dataset. To help analysis, it would be nice to have the data partitioned by the restaurant LOCAL date."
]
},
{
- "cell_type": "code",
- "execution_count": 2,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Starting PySpark session. Check your terminal for detailed logging...\n",
- "PySpark session sucessfully created.\n"
- ]
- }
- ],
"source": [
- "spark = create_pyspark_session()"
+ "Loads libraries"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "(#rows, #columns) = (3683040, 22)\n",
- "root\n",
- " |-- cpf: string (nullable = true)\n",
- " |-- customer_id: string (nullable = true)\n",
- " |-- customer_name: string (nullable = true)\n",
- " |-- delivery_address_city: string (nullable = true)\n",
- " |-- delivery_address_country: string (nullable = true)\n",
- " |-- delivery_address_district: string (nullable = true)\n",
- " |-- delivery_address_external_id: string (nullable = true)\n",
- " |-- delivery_address_latitude: string (nullable = true)\n",
- " |-- delivery_address_longitude: string (nullable = true)\n",
- " |-- delivery_address_state: string (nullable = true)\n",
- " |-- delivery_address_zip_code: string (nullable = true)\n",
- " |-- items: string (nullable = true)\n",
- " |-- merchant_id: string (nullable = true)\n",
- " |-- merchant_latitude: string (nullable = true)\n",
- " |-- merchant_longitude: string (nullable = true)\n",
- " |-- merchant_timezone: string (nullable = true)\n",
- " |-- order_created_at: timestamp (nullable = true)\n",
- " |-- order_id: string (nullable = true)\n",
- " |-- order_scheduled: boolean (nullable = true)\n",
- " |-- order_scheduled_date: timestamp (nullable = true)\n",
- " |-- order_total_amount: double (nullable = true)\n",
- " |-- origin_platform: string (nullable = true)\n",
- "\n"
+ "The autoreload extension is already loaded. To reload it, use:\n",
+ " %reload_ext autoreload\n"
]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " cpf | \n",
- " 80532101763 | \n",
- " 43352103961 | \n",
- " 38650991217 | \n",
- "
\n",
- " \n",
- " customer_id | \n",
- " 977b9a89-825f-464b-8ef6-0f453d7334c1 | \n",
- " e969cc0d-388b-4025-9351-0db0f718d81c | \n",
- " e08dcc8b-f998-405e-b3f2-7107ea8958cf | \n",
- "
\n",
- " \n",
- " customer_name | \n",
- " GUSTAVO | \n",
- " MICHELLE | \n",
- " VICTOR | \n",
- "
\n",
- " \n",
- " delivery_address_city | \n",
- " FRANCA | \n",
- " SANTOS | \n",
- " GUARULHOS | \n",
- "
\n",
- " \n",
- " delivery_address_country | \n",
- " BR | \n",
- " BR | \n",
- " BR | \n",
- "
\n",
- " \n",
- " delivery_address_district | \n",
- " JARDIM ESPRAIADO | \n",
- " CAMPO GRANDE | \n",
- " JARDIM ROSSI | \n",
- "
\n",
- " \n",
- " delivery_address_external_id | \n",
- " 6736655 | \n",
- " 8759216 | \n",
- " 8765930 | \n",
- "
\n",
- " \n",
- " delivery_address_latitude | \n",
- " -47.39 | \n",
- " -46.34 | \n",
- " -46.53 | \n",
- "
\n",
- " \n",
- " delivery_address_longitude | \n",
- " -20.55 | \n",
- " -23.96 | \n",
- " -23.44 | \n",
- "
\n",
- " \n",
- " delivery_address_state | \n",
- " SP | \n",
- " SP | \n",
- " SP | \n",
- "
\n",
- " \n",
- " delivery_address_zip_code | \n",
- " 14403 | \n",
- " 11070 | \n",
- " 71304 | \n",
- "
\n",
- " \n",
- " items | \n",
- " [{\"name\": \"Parmegiana de Filé de Frango (2 pes... | \n",
- " [{\"name\": \"Filé Mignon à Cubana\", \"addition\": ... | \n",
- " [{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va... | \n",
- "
\n",
- " \n",
- " merchant_id | \n",
- " eb4197f9-964c-4f87-8307-709e498aab87 | \n",
- " 927d46f9-4bb3-48f7-be1d-584deaf18adc | \n",
- " 71ad62c5-5947-4518-9846-976fbdd2f881 | \n",
- "
\n",
- " \n",
- " merchant_latitude | \n",
- " -47.39 | \n",
- " -46.34 | \n",
- " -46.53 | \n",
- "
\n",
- " \n",
- " merchant_longitude | \n",
- " -20.55 | \n",
- " -23.96 | \n",
- " -23.44 | \n",
- "
\n",
- " \n",
- " merchant_timezone | \n",
- " America/Sao_Paulo | \n",
- " America/Sao_Paulo | \n",
- " America/Sao_Paulo | \n",
- "
\n",
- " \n",
- " order_created_at | \n",
- " 2019-01-17 22:50:06 | \n",
- " 2019-01-17 17:51:26 | \n",
- " 2019-01-17 22:53:47 | \n",
- "
\n",
- " \n",
- " order_id | \n",
- " dd4f8f0a-c2cb-45c6-a002-c3be6b305e5f | \n",
- " 8dd80f0b-db00-4b88-b7e2-02ca706fc5a5 | \n",
- " 430f9887-a563-45ee-8001-1cb29597d9dd | \n",
- "
\n",
- " \n",
- " order_scheduled | \n",
- " False | \n",
- " False | \n",
- " False | \n",
- "
\n",
- " \n",
- " order_scheduled_date | \n",
- " NaT | \n",
- " NaT | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " order_total_amount | \n",
- " 46 | \n",
- " 104.5 | \n",
- " 35 | \n",
- "
\n",
- " \n",
- " origin_platform | \n",
- " ANDROID | \n",
- " ANDROID | \n",
- " IOS | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " 0 \\\n",
- "cpf 80532101763 \n",
- "customer_id 977b9a89-825f-464b-8ef6-0f453d7334c1 \n",
- "customer_name GUSTAVO \n",
- "delivery_address_city FRANCA \n",
- "delivery_address_country BR \n",
- "delivery_address_district JARDIM ESPRAIADO \n",
- "delivery_address_external_id 6736655 \n",
- "delivery_address_latitude -47.39 \n",
- "delivery_address_longitude -20.55 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 14403 \n",
- "items [{\"name\": \"Parmegiana de Filé de Frango (2 pes... \n",
- "merchant_id eb4197f9-964c-4f87-8307-709e498aab87 \n",
- "merchant_latitude -47.39 \n",
- "merchant_longitude -20.55 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-17 22:50:06 \n",
- "order_id dd4f8f0a-c2cb-45c6-a002-c3be6b305e5f \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 46 \n",
- "origin_platform ANDROID \n",
- "\n",
- " 1 \\\n",
- "cpf 43352103961 \n",
- "customer_id e969cc0d-388b-4025-9351-0db0f718d81c \n",
- "customer_name MICHELLE \n",
- "delivery_address_city SANTOS \n",
- "delivery_address_country BR \n",
- "delivery_address_district CAMPO GRANDE \n",
- "delivery_address_external_id 8759216 \n",
- "delivery_address_latitude -46.34 \n",
- "delivery_address_longitude -23.96 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 11070 \n",
- "items [{\"name\": \"Filé Mignon à Cubana\", \"addition\": ... \n",
- "merchant_id 927d46f9-4bb3-48f7-be1d-584deaf18adc \n",
- "merchant_latitude -46.34 \n",
- "merchant_longitude -23.96 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-17 17:51:26 \n",
- "order_id 8dd80f0b-db00-4b88-b7e2-02ca706fc5a5 \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 104.5 \n",
- "origin_platform ANDROID \n",
- "\n",
- " 2 \n",
- "cpf 38650991217 \n",
- "customer_id e08dcc8b-f998-405e-b3f2-7107ea8958cf \n",
- "customer_name VICTOR \n",
- "delivery_address_city GUARULHOS \n",
- "delivery_address_country BR \n",
- "delivery_address_district JARDIM ROSSI \n",
- "delivery_address_external_id 8765930 \n",
- "delivery_address_latitude -46.53 \n",
- "delivery_address_longitude -23.44 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 71304 \n",
- "items [{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va... \n",
- "merchant_id 71ad62c5-5947-4518-9846-976fbdd2f881 \n",
- "merchant_latitude -46.53 \n",
- "merchant_longitude -23.44 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-17 22:53:47 \n",
- "order_id 430f9887-a563-45ee-8001-1cb29597d9dd \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 35 \n",
- "origin_platform IOS "
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
}
],
"source": [
- "o_df = spark.read.parquet(str(RAW_DATA_PATH / 'order'))\n",
- "explore_dataframe(o_df)\n",
- "o_df.limit(3).toPandas().T"
+ "%load_ext autoreload\n",
+ "%autoreload 2\n",
+ "from src.config import RAW_DATA_PATH\n",
+ "from src.DataProcessor import explore_dataframe, extract_latest_values, add_prefix, create_trusted_order\n",
+ "from src.IOController import create_pyspark_session"
]
},
{
- "cell_type": "code",
- "execution_count": 4,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(#rows, #columns) = (809323, 7)\n",
- "root\n",
- " |-- customer_id: string (nullable = true)\n",
- " |-- language: string (nullable = true)\n",
- " |-- created_at: string (nullable = true)\n",
- " |-- active: string (nullable = true)\n",
- " |-- customer_name: string (nullable = true)\n",
- " |-- customer_phone_area: string (nullable = true)\n",
- " |-- customer_phone_number: string (nullable = true)\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id | \n",
- " 00039466-560f-4e57-85a2-d4753cd901be | \n",
- " 001a1267-31a3-4f5b-a028-d7e323864b08 | \n",
- " 003ae1d5-67b8-4a04-b055-0e4e9622771a | \n",
- "
\n",
- " \n",
- " consumer_language | \n",
- " pt-br | \n",
- " pt-br | \n",
- " pt-br | \n",
- "
\n",
- " \n",
- " consumer_created_at | \n",
- " 2018-04-05T14:49:18.165Z | \n",
- " 2018-01-14T21:40:02.141Z | \n",
- " 2018-01-07T03:47:15.554Z | \n",
- "
\n",
- " \n",
- " consumer_active | \n",
- " true | \n",
- " true | \n",
- " true | \n",
- "
\n",
- " \n",
- " consumer_customer_name | \n",
- " NUNO | \n",
- " ADRIELLY | \n",
- " PAULA | \n",
- "
\n",
- " \n",
- " consumer_customer_phone_area | \n",
- " 46 | \n",
- " 59 | \n",
- " 62 | \n",
- "
\n",
- " \n",
- " consumer_customer_phone_number | \n",
- " 816135924 | \n",
- " 231330577 | \n",
- " 347597883 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " 0 \\\n",
- "customer_id 00039466-560f-4e57-85a2-d4753cd901be \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-04-05T14:49:18.165Z \n",
- "consumer_active true \n",
- "consumer_customer_name NUNO \n",
- "consumer_customer_phone_area 46 \n",
- "consumer_customer_phone_number 816135924 \n",
- "\n",
- " 1 \\\n",
- "customer_id 001a1267-31a3-4f5b-a028-d7e323864b08 \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-01-14T21:40:02.141Z \n",
- "consumer_active true \n",
- "consumer_customer_name ADRIELLY \n",
- "consumer_customer_phone_area 59 \n",
- "consumer_customer_phone_number 231330577 \n",
- "\n",
- " 2 \n",
- "customer_id 003ae1d5-67b8-4a04-b055-0e4e9622771a \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-01-07T03:47:15.554Z \n",
- "consumer_active true \n",
- "consumer_customer_name PAULA \n",
- "consumer_customer_phone_area 62 \n",
- "consumer_customer_phone_number 347597883 "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "table = 'consumer'\n",
- "c_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n",
- "explore_dataframe(c_df)\n",
- "c_df = add_prefix(c_df, table)\n",
- "c_df.limit(3).toPandas().T"
+ "Starts SparkSession."
]
},
{
"cell_type": "code",
- "execution_count": 5,
- "metadata": {},
+ "execution_count": 32,
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "(#rows, #columns) = (7292, 12)\n",
- "root\n",
- " |-- id: string (nullable = true)\n",
- " |-- created_at: string (nullable = true)\n",
- " |-- enabled: string (nullable = true)\n",
- " |-- price_range: string (nullable = true)\n",
- " |-- average_ticket: string (nullable = true)\n",
- " |-- takeout_time: string (nullable = true)\n",
- " |-- delivery_time: string (nullable = true)\n",
- " |-- minimum_order_value: string (nullable = true)\n",
- " |-- merchant_zip_code: string (nullable = true)\n",
- " |-- merchant_city: string (nullable = true)\n",
- " |-- merchant_state: string (nullable = true)\n",
- " |-- merchant_country: string (nullable = true)\n",
- "\n"
+ "Starting PySpark session. Check your terminal for detailed logging...\n",
+ "PySpark session sucessfully created.\n"
]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " merchant_id | \n",
- " 02c94103-61f3-4906-a4a9-55611db9f28c | \n",
- " 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d | \n",
- " 33ca5d3d-b99f-404d-84d9-8df8f38a2261 | \n",
- "
\n",
- " \n",
- " restaurant_created_at | \n",
- " 2017-01-23T12:52:30.910Z | \n",
- " 2017-01-20T13:14:48.286Z | \n",
- " 2017-01-23T12:46:33.457Z | \n",
- "
\n",
- " \n",
- " restaurant_enabled | \n",
- " false | \n",
- " true | \n",
- " true | \n",
- "
\n",
- " \n",
- " restaurant_price_range | \n",
- " 3 | \n",
- " 3 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- " restaurant_average_ticket | \n",
- " 60.0 | \n",
- " 60.0 | \n",
- " 100.0 | \n",
- "
\n",
- " \n",
- " restaurant_takeout_time | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " restaurant_delivery_time | \n",
- " 50 | \n",
- " 0 | \n",
- " 45 | \n",
- "
\n",
- " \n",
- " restaurant_minimum_order_value | \n",
- " 30.0 | \n",
- " 30.0 | \n",
- " 10.0 | \n",
- "
\n",
- " \n",
- " restaurant_merchant_zip_code | \n",
- " 14025 | \n",
- " 50180 | \n",
- " 23090 | \n",
- "
\n",
- " \n",
- " restaurant_merchant_city | \n",
- " RIBEIRAO PRETO | \n",
- " SAO PAULO | \n",
- " RIO DE JANEIRO | \n",
- "
\n",
- " \n",
- " restaurant_merchant_state | \n",
- " SP | \n",
- " SP | \n",
- " RJ | \n",
- "
\n",
- " \n",
- " restaurant_merchant_country | \n",
- " BR | \n",
- " BR | \n",
- " BR | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " 0 \\\n",
- "merchant_id 02c94103-61f3-4906-a4a9-55611db9f28c \n",
- "restaurant_created_at 2017-01-23T12:52:30.910Z \n",
- "restaurant_enabled false \n",
- "restaurant_price_range 3 \n",
- "restaurant_average_ticket 60.0 \n",
- "restaurant_takeout_time 0 \n",
- "restaurant_delivery_time 50 \n",
- "restaurant_minimum_order_value 30.0 \n",
- "restaurant_merchant_zip_code 14025 \n",
- "restaurant_merchant_city RIBEIRAO PRETO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "\n",
- " 1 \\\n",
- "merchant_id 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d \n",
- "restaurant_created_at 2017-01-20T13:14:48.286Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 3 \n",
- "restaurant_average_ticket 60.0 \n",
- "restaurant_takeout_time 0 \n",
- "restaurant_delivery_time 0 \n",
- "restaurant_minimum_order_value 30.0 \n",
- "restaurant_merchant_zip_code 50180 \n",
- "restaurant_merchant_city SAO PAULO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "\n",
- " 2 \n",
- "merchant_id 33ca5d3d-b99f-404d-84d9-8df8f38a2261 \n",
- "restaurant_created_at 2017-01-23T12:46:33.457Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 5 \n",
- "restaurant_average_ticket 100.0 \n",
- "restaurant_takeout_time 0 \n",
- "restaurant_delivery_time 45 \n",
- "restaurant_minimum_order_value 10.0 \n",
- "restaurant_merchant_zip_code 23090 \n",
- "restaurant_merchant_city RIO DE JANEIRO \n",
- "restaurant_merchant_state RJ \n",
- "restaurant_merchant_country BR "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
}
],
"source": [
- "table = 'restaurant'\n",
- "r_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n",
- "explore_dataframe(r_df)\n",
- "r_df = r_df.withColumnRenamed('id', 'merchant_id')\n",
- "r_df = add_prefix(r_df, table)\n",
- "r_df.limit(3).toPandas().T"
+ "spark = create_pyspark_session()"
]
},
{
- "cell_type": "code",
- "execution_count": 6,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(#rows, #columns) = (11075048, 4)\n",
- "root\n",
- " |-- created_at: timestamp (nullable = true)\n",
- " |-- order_id: string (nullable = true)\n",
- " |-- status_id: string (nullable = true)\n",
- " |-- value: string (nullable = true)\n",
- "\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " status_created_at | \n",
- " order_id | \n",
- " status_id | \n",
- " status_value | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2019-01-25 01:05:07 | \n",
- " 0002fe02-d7dc-4232-b7ac-3394019ce240 | \n",
- " b4298862-fa38-499a-93e2-a76930fb2bce | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2019-01-24 23:04:27 | \n",
- " 0002fe02-d7dc-4232-b7ac-3394019ce240 | \n",
- " 7964bf63-007a-484d-a321-e9118ccc2f97 | \n",
- " REGISTERED | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2019-01-24 23:04:28 | \n",
- " 0002fe02-d7dc-4232-b7ac-3394019ce240 | \n",
- " ca16b92b-db8f-4274-b165-929675541a9f | \n",
- " PLACED | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " status_created_at order_id \\\n",
- "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "1 2019-01-24 23:04:27 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "2 2019-01-24 23:04:28 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "\n",
- " status_id status_value \n",
- "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n",
- "1 7964bf63-007a-484d-a321-e9118ccc2f97 REGISTERED \n",
- "2 ca16b92b-db8f-4274-b165-929675541a9f PLACED "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "table = 'status'\n",
- "s_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n",
- "explore_dataframe(s_df)\n",
- "s_df = add_prefix(s_df, table)\n",
- "s_df.limit(3).toPandas()"
+ "Generates Order dataset for trusted layer."
]
},
{
"cell_type": "code",
- "execution_count": 7,
- "metadata": {},
+ "execution_count": 33,
+ "metadata": {
+ "scrolled": true
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "(#rows, #columns) = (2441067, 4)\n",
- "root\n",
- " |-- status_created_at: timestamp (nullable = true)\n",
- " |-- order_id: string (nullable = true)\n",
- " |-- status_id: string (nullable = true)\n",
- " |-- status_value: string (nullable = true)\n",
- "\n"
+ "Starting processing to generate Order Items dataset...\n",
+ "Exporting dataset file system...\n",
+ "Dataset sucessfully exported to `/home/jovyan/data/trusted/order`!\n"
]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " status_created_at | \n",
- " order_id | \n",
- " status_id | \n",
- " status_value | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2019-01-25 01:05:07 | \n",
- " 0002fe02-d7dc-4232-b7ac-3394019ce240 | \n",
- " b4298862-fa38-499a-93e2-a76930fb2bce | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2019-01-03 20:15:06 | \n",
- " 0012d95c-9c4b-4244-86b5-dcf87677dcc1 | \n",
- " c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2019-01-06 16:20:27 | \n",
- " 0013fc5c-4c10-4402-886c-1b8166e4632e | \n",
- " d0a3ffd5-4e48-4cc4-9739-d5764678c19f | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2019-01-03 15:55:12 | \n",
- " 00251da6-aa45-4512-be58-6622a248cdff | \n",
- " a621de13-5272-4c7c-969c-3bcd53f0515f | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 2019-01-21 22:20:02 | \n",
- " 00273652-efc8-4e7c-95b2-cd3827900e7e | \n",
- " 74f8ff33-e731-477f-a698-4755577b80a1 | \n",
- " CONCLUDED | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " status_created_at order_id \\\n",
- "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "1 2019-01-03 20:15:06 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n",
- "2 2019-01-06 16:20:27 0013fc5c-4c10-4402-886c-1b8166e4632e \n",
- "3 2019-01-03 15:55:12 00251da6-aa45-4512-be58-6622a248cdff \n",
- "4 2019-01-21 22:20:02 00273652-efc8-4e7c-95b2-cd3827900e7e \n",
- "\n",
- " status_id status_value \n",
- "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n",
- "1 c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 CONCLUDED \n",
- "2 d0a3ffd5-4e48-4cc4-9739-d5764678c19f CONCLUDED \n",
- "3 a621de13-5272-4c7c-969c-3bcd53f0515f CONCLUDED \n",
- "4 74f8ff33-e731-477f-a698-4755577b80a1 CONCLUDED "
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
}
],
"source": [
- "s_df = extract_latest_values(df=s_df, id_col='order_id', dt_col='status_created_at')\n",
- "explore_dataframe(s_df)\n",
- "s_df.limit(5).toPandas()"
+ "tmp = create_trusted_order(spark)"
]
},
{
- "cell_type": "code",
- "execution_count": 8,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "+------------+-------+\n",
- "|status_value| count|\n",
- "+------------+-------+\n",
- "| CONCLUDED|2354218|\n",
- "| CANCELLED| 55179|\n",
- "| PLACED| 31654|\n",
- "| REGISTERED| 16|\n",
- "+------------+-------+\n",
- "\n"
- ]
- }
- ],
"source": [
- "s_df.groupBy('status_value').count().sort('count', ascending=False).show()"
+ "Shows 3 rows."
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 35,
"metadata": {},
"outputs": [
{
@@ -922,31 +155,19 @@
" e2213649-cede-4770-b6e0-7ac1dd4d3548 | \n",
" \n",
" \n",
- " cpf | \n",
- " 16854185492 | \n",
- " 05264105611 | \n",
- " 78163676650 | \n",
- "
\n",
- " \n",
- " customer_name | \n",
- " EDUARDO | \n",
- " BRUNO | \n",
- " BRUNO | \n",
- "
\n",
- " \n",
- " delivery_address_city | \n",
+ " order_delivery_address_city | \n",
" SAO PAULO | \n",
" SAO PAULO | \n",
" SAO PAULO | \n",
"
\n",
" \n",
- " delivery_address_country | \n",
+ " order_delivery_address_country | \n",
" BR | \n",
" BR | \n",
" BR | \n",
"
\n",
" \n",
- " delivery_address_district | \n",
+ " order_delivery_address_district | \n",
" ITAIM BIBI | \n",
" PENHA DE FRANCA | \n",
" PENHA DE FRANCA | \n",
@@ -958,79 +179,79 @@
" 7632090 | \n",
"
\n",
" \n",
- " delivery_address_latitude | \n",
+ " order_delivery_address_latitude | \n",
" -46.68 | \n",
" -46.54 | \n",
" -46.54 | \n",
"
\n",
" \n",
- " delivery_address_longitude | \n",
+ " order_delivery_address_longitude | \n",
" -23.59 | \n",
" -23.52 | \n",
" -23.52 | \n",
"
\n",
" \n",
- " delivery_address_state | \n",
+ " order_delivery_address_state | \n",
" SP | \n",
" SP | \n",
" SP | \n",
"
\n",
" \n",
- " delivery_address_zip_code | \n",
+ " order_delivery_address_zip_code | \n",
" 45381 | \n",
" 36100 | \n",
" 36100 | \n",
"
\n",
" \n",
- " items | \n",
+ " order_items | \n",
" [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... | \n",
" [{\"name\": \"Porção Batata Frita\", \"addition\": {... | \n",
" [{\"name\": \"Porção Batata Frita\", \"addition\": {... | \n",
"
\n",
" \n",
- " merchant_latitude | \n",
+ " order_merchant_latitude | \n",
" -46.68 | \n",
" -46.54 | \n",
" -46.54 | \n",
"
\n",
" \n",
- " merchant_longitude | \n",
+ " order_merchant_longitude | \n",
" -23.59 | \n",
" -23.52 | \n",
" -23.52 | \n",
"
\n",
" \n",
- " merchant_timezone | \n",
+ " order_merchant_timezone | \n",
" America/Sao_Paulo | \n",
" America/Sao_Paulo | \n",
" America/Sao_Paulo | \n",
"
\n",
" \n",
- " order_created_at | \n",
+ " order_order_created_at | \n",
" 2019-01-24 23:04:27 | \n",
" 2019-01-03 18:12:24 | \n",
" 2018-12-04 18:12:24 | \n",
"
\n",
" \n",
- " order_scheduled | \n",
+ " order_order_scheduled | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
- " order_scheduled_date | \n",
+ " order_order_scheduled_date | \n",
" NaT | \n",
" NaT | \n",
" NaT | \n",
"
\n",
" \n",
- " order_total_amount | \n",
+ " order_order_total_amount | \n",
" 27 | \n",
" 17.5 | \n",
" 17.5 | \n",
"
\n",
" \n",
- " origin_platform | \n",
+ " order_origin_platform | \n",
" ANDROID | \n",
" ANDROID | \n",
" ANDROID | \n",
@@ -1054,24 +275,12 @@
" true | \n",
"
\n",
" \n",
- " consumer_customer_name | \n",
- " EDUARDO | \n",
- " BRUNO | \n",
- " BRUNO | \n",
- "
\n",
- " \n",
" consumer_customer_phone_area | \n",
" 83 | \n",
" 60 | \n",
" 60 | \n",
"
\n",
" \n",
- " consumer_customer_phone_number | \n",
- " 020082840 | \n",
- " 109441873 | \n",
- " 109441873 | \n",
- "
\n",
- " \n",
" restaurant_created_at | \n",
" 2017-01-20T13:14:41.451Z | \n",
" 2017-01-20T13:14:16.179Z | \n",
@@ -1160,183 +369,141 @@
""
],
"text/plain": [
- " 0 \\\n",
- "order_id 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
- "merchant_id e3d24e4d-2f51-4987-8c39-47923b20b9be \n",
- "customer_id 97c53c25-bd9b-41cb-8a0d-13cd74509f17 \n",
- "cpf 16854185492 \n",
- "customer_name EDUARDO \n",
- "delivery_address_city SAO PAULO \n",
- "delivery_address_country BR \n",
- "delivery_address_district ITAIM BIBI \n",
- "delivery_address_external_id 8847122 \n",
- "delivery_address_latitude -46.68 \n",
- "delivery_address_longitude -23.59 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 45381 \n",
- "items [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... \n",
- "merchant_latitude -46.68 \n",
- "merchant_longitude -23.59 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-24 23:04:27 \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 27 \n",
- "origin_platform ANDROID \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-04-05T13:20:39.644Z \n",
- "consumer_active true \n",
- "consumer_customer_name EDUARDO \n",
- "consumer_customer_phone_area 83 \n",
- "consumer_customer_phone_number 020082840 \n",
- "restaurant_created_at 2017-01-20T13:14:41.451Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 1 \n",
- "restaurant_average_ticket 30.0 \n",
- "restaurant_takeout_time 0 \n",
- "restaurant_delivery_time 40 \n",
- "restaurant_minimum_order_value 30.0 \n",
- "restaurant_merchant_zip_code 56560 \n",
- "restaurant_merchant_city SAO PAULO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "status_created_at 2019-01-25 01:05:07 \n",
- "status_id b4298862-fa38-499a-93e2-a76930fb2bce \n",
- "status_value CONCLUDED \n",
+ " 0 \\\n",
+ "order_id 0002fe02-d7dc-4232-b7ac-3394019ce240 \n",
+ "merchant_id e3d24e4d-2f51-4987-8c39-47923b20b9be \n",
+ "customer_id 97c53c25-bd9b-41cb-8a0d-13cd74509f17 \n",
+ "order_delivery_address_city SAO PAULO \n",
+ "order_delivery_address_country BR \n",
+ "order_delivery_address_district ITAIM BIBI \n",
+ "delivery_address_external_id 8847122 \n",
+ "order_delivery_address_latitude -46.68 \n",
+ "order_delivery_address_longitude -23.59 \n",
+ "order_delivery_address_state SP \n",
+ "order_delivery_address_zip_code 45381 \n",
+ "order_items [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... \n",
+ "order_merchant_latitude -46.68 \n",
+ "order_merchant_longitude -23.59 \n",
+ "order_merchant_timezone America/Sao_Paulo \n",
+ "order_order_created_at 2019-01-24 23:04:27 \n",
+ "order_order_scheduled False \n",
+ "order_order_scheduled_date NaT \n",
+ "order_order_total_amount 27 \n",
+ "order_origin_platform ANDROID \n",
+ "consumer_language pt-br \n",
+ "consumer_created_at 2018-04-05T13:20:39.644Z \n",
+ "consumer_active true \n",
+ "consumer_customer_phone_area 83 \n",
+ "restaurant_created_at 2017-01-20T13:14:41.451Z \n",
+ "restaurant_enabled true \n",
+ "restaurant_price_range 1 \n",
+ "restaurant_average_ticket 30.0 \n",
+ "restaurant_takeout_time 0 \n",
+ "restaurant_delivery_time 40 \n",
+ "restaurant_minimum_order_value 30.0 \n",
+ "restaurant_merchant_zip_code 56560 \n",
+ "restaurant_merchant_city SAO PAULO \n",
+ "restaurant_merchant_state SP \n",
+ "restaurant_merchant_country BR \n",
+ "status_created_at 2019-01-25 01:05:07 \n",
+ "status_id b4298862-fa38-499a-93e2-a76930fb2bce \n",
+ "status_value CONCLUDED \n",
"\n",
- " 1 \\\n",
- "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n",
- "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n",
- "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n",
- "cpf 05264105611 \n",
- "customer_name BRUNO \n",
- "delivery_address_city SAO PAULO \n",
- "delivery_address_country BR \n",
- "delivery_address_district PENHA DE FRANCA \n",
- "delivery_address_external_id 7632090 \n",
- "delivery_address_latitude -46.54 \n",
- "delivery_address_longitude -23.52 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 36100 \n",
- "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n",
- "merchant_latitude -46.54 \n",
- "merchant_longitude -23.52 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2019-01-03 18:12:24 \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 17.5 \n",
- "origin_platform ANDROID \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-01-06T14:31:43.348Z \n",
- "consumer_active true \n",
- "consumer_customer_name BRUNO \n",
- "consumer_customer_phone_area 60 \n",
- "consumer_customer_phone_number 109441873 \n",
- "restaurant_created_at 2017-01-20T13:14:16.179Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 1 \n",
- "restaurant_average_ticket 30.0 \n",
- "restaurant_takeout_time 20 \n",
- "restaurant_delivery_time 0 \n",
- "restaurant_minimum_order_value 0.0 \n",
- "restaurant_merchant_zip_code 36350 \n",
- "restaurant_merchant_city SAO PAULO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "status_created_at 2019-01-03 20:15:06 \n",
- "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n",
- "status_value CONCLUDED \n",
+ " 1 \\\n",
+ "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n",
+ "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n",
+ "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n",
+ "order_delivery_address_city SAO PAULO \n",
+ "order_delivery_address_country BR \n",
+ "order_delivery_address_district PENHA DE FRANCA \n",
+ "delivery_address_external_id 7632090 \n",
+ "order_delivery_address_latitude -46.54 \n",
+ "order_delivery_address_longitude -23.52 \n",
+ "order_delivery_address_state SP \n",
+ "order_delivery_address_zip_code 36100 \n",
+ "order_items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n",
+ "order_merchant_latitude -46.54 \n",
+ "order_merchant_longitude -23.52 \n",
+ "order_merchant_timezone America/Sao_Paulo \n",
+ "order_order_created_at 2019-01-03 18:12:24 \n",
+ "order_order_scheduled False \n",
+ "order_order_scheduled_date NaT \n",
+ "order_order_total_amount 17.5 \n",
+ "order_origin_platform ANDROID \n",
+ "consumer_language pt-br \n",
+ "consumer_created_at 2018-01-06T14:31:43.348Z \n",
+ "consumer_active true \n",
+ "consumer_customer_phone_area 60 \n",
+ "restaurant_created_at 2017-01-20T13:14:16.179Z \n",
+ "restaurant_enabled true \n",
+ "restaurant_price_range 1 \n",
+ "restaurant_average_ticket 30.0 \n",
+ "restaurant_takeout_time 20 \n",
+ "restaurant_delivery_time 0 \n",
+ "restaurant_minimum_order_value 0.0 \n",
+ "restaurant_merchant_zip_code 36350 \n",
+ "restaurant_merchant_city SAO PAULO \n",
+ "restaurant_merchant_state SP \n",
+ "restaurant_merchant_country BR \n",
+ "status_created_at 2019-01-03 20:15:06 \n",
+ "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n",
+ "status_value CONCLUDED \n",
"\n",
- " 2 \n",
- "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n",
- "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n",
- "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n",
- "cpf 78163676650 \n",
- "customer_name BRUNO \n",
- "delivery_address_city SAO PAULO \n",
- "delivery_address_country BR \n",
- "delivery_address_district PENHA DE FRANCA \n",
- "delivery_address_external_id 7632090 \n",
- "delivery_address_latitude -46.54 \n",
- "delivery_address_longitude -23.52 \n",
- "delivery_address_state SP \n",
- "delivery_address_zip_code 36100 \n",
- "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n",
- "merchant_latitude -46.54 \n",
- "merchant_longitude -23.52 \n",
- "merchant_timezone America/Sao_Paulo \n",
- "order_created_at 2018-12-04 18:12:24 \n",
- "order_scheduled False \n",
- "order_scheduled_date NaT \n",
- "order_total_amount 17.5 \n",
- "origin_platform ANDROID \n",
- "consumer_language pt-br \n",
- "consumer_created_at 2018-01-06T14:31:43.348Z \n",
- "consumer_active true \n",
- "consumer_customer_name BRUNO \n",
- "consumer_customer_phone_area 60 \n",
- "consumer_customer_phone_number 109441873 \n",
- "restaurant_created_at 2017-01-20T13:14:16.179Z \n",
- "restaurant_enabled true \n",
- "restaurant_price_range 1 \n",
- "restaurant_average_ticket 30.0 \n",
- "restaurant_takeout_time 20 \n",
- "restaurant_delivery_time 0 \n",
- "restaurant_minimum_order_value 0.0 \n",
- "restaurant_merchant_zip_code 36350 \n",
- "restaurant_merchant_city SAO PAULO \n",
- "restaurant_merchant_state SP \n",
- "restaurant_merchant_country BR \n",
- "status_created_at 2019-01-03 20:15:06 \n",
- "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n",
- "status_value CONCLUDED "
+ " 2 \n",
+ "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n",
+ "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n",
+ "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n",
+ "order_delivery_address_city SAO PAULO \n",
+ "order_delivery_address_country BR \n",
+ "order_delivery_address_district PENHA DE FRANCA \n",
+ "delivery_address_external_id 7632090 \n",
+ "order_delivery_address_latitude -46.54 \n",
+ "order_delivery_address_longitude -23.52 \n",
+ "order_delivery_address_state SP \n",
+ "order_delivery_address_zip_code 36100 \n",
+ "order_items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n",
+ "order_merchant_latitude -46.54 \n",
+ "order_merchant_longitude -23.52 \n",
+ "order_merchant_timezone America/Sao_Paulo \n",
+ "order_order_created_at 2018-12-04 18:12:24 \n",
+ "order_order_scheduled False \n",
+ "order_order_scheduled_date NaT \n",
+ "order_order_total_amount 17.5 \n",
+ "order_origin_platform ANDROID \n",
+ "consumer_language pt-br \n",
+ "consumer_created_at 2018-01-06T14:31:43.348Z \n",
+ "consumer_active true \n",
+ "consumer_customer_phone_area 60 \n",
+ "restaurant_created_at 2017-01-20T13:14:16.179Z \n",
+ "restaurant_enabled true \n",
+ "restaurant_price_range 1 \n",
+ "restaurant_average_ticket 30.0 \n",
+ "restaurant_takeout_time 20 \n",
+ "restaurant_delivery_time 0 \n",
+ "restaurant_minimum_order_value 0.0 \n",
+ "restaurant_merchant_zip_code 36350 \n",
+ "restaurant_merchant_city SAO PAULO \n",
+ "restaurant_merchant_state SP \n",
+ "restaurant_merchant_country BR \n",
+ "status_created_at 2019-01-03 20:15:06 \n",
+ "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n",
+ "status_value CONCLUDED "
]
},
- "execution_count": 9,
+ "execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "tmp = (o_df\n",
- " .join(c_df, on='customer_id', how='left')\n",
- " .join(r_df, on='merchant_id', how='left')\n",
- " .join(s_df, on='order_id', how='left')\n",
- " )\n",
"tmp.limit(3).toPandas().T"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Fix schema and anonymize sensitive data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "# for dtype, cols in dtypes.items():\n",
- "# for col in cols:\n",
- "# df = df.withColumn(col, df[col].cast(dtype))"
+ "Order dataset successfully generated!"
]
}
],
diff --git a/dev/docker-volume/src/DataProcessor.py b/dev/docker-volume/src/DataProcessor.py
index dc48e14..6e6d9c6 100644
--- a/dev/docker-volume/src/DataProcessor.py
+++ b/dev/docker-volume/src/DataProcessor.py
@@ -1,7 +1,7 @@
-from src.config import TRUSTED_DATA_PATH, RAW_DATA_PATH
from pyspark.sql import Window
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.session import SparkSession
+from src.config import TRUSTED_DATA_PATH, RAW_DATA_PATH
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, FloatType
from pyspark.sql.functions import from_json, explode, flatten, col, rank, col, monotonically_increasing_id, desc
@@ -13,7 +13,7 @@ def explore_dataframe(df:DataFrame):
print('(#rows, #columns) =', (df.count(), len(df.columns)))
return df.printSchema()
-def fix_schema(df:DataFrame, dtypes:dict):
+def fix_dataframe_dtypes(df:DataFrame, dtypes:dict):
"""
Return DataFrame `df` with corrected schema based on dtypes
"""
@@ -23,15 +23,16 @@ def fix_schema(df:DataFrame, dtypes:dict):
return df
-def fix_order_schema(df):
+def fix_order_dtypes(df:DataFrame):
dtypes = {
'float': [
- 'delivery_address_latitude', 'delivery_address_longitude',
- 'merchant_latitude', 'merchant_longitude', 'order_total_amount'],
+ 'order_delivery_address_latitude', 'order_delivery_address_longitude', 'order_merchant_latitude',
+ 'order_merchant_longitude', 'order_order_total_amount', 'restaurant_price_range',
+ 'restaurant_average_ticket', 'restaurant_takeout_time', 'restaurant_delivery_time'],
'bigint': [
- 'cpf', 'delivery_address_zip_code']
- }
+ 'order_delivery_address_zip_code', 'restaurant_merchant_zip_code'
+ ]}
df = fix_schema(df, dtypes)
@@ -68,7 +69,7 @@ def create_trusted_order_items(spark:SparkSession):
tmp = tmp.dropDuplicates()
output_path = TRUSTED_DATA_PATH / 'order_items'
- print(f'Exporting dataset file system...')
+ print(f'Exporting dataset to file system...')
tmp.write.parquet(str(output_path))
@@ -91,6 +92,9 @@ def extract_latest_values(df:DataFrame, id_col:str, dt_col:str):
return df
def load_sanitized_dataframe(table:str, spark:SparkSession):
+ """
+ Loads the raw `table` DataFrame in the standard form used to produce the joined datamart dataset `Order`.
+ """
df = spark.read.parquet(str(RAW_DATA_PATH / table))
df = add_prefix(df, table)
@@ -118,7 +122,7 @@ def create_trusted_order(spark:SparkSession):
.dropDuplicates()
)
- print(f'Exporting dataset file system...')
+ print(f'Exporting dataset to file system...')
output_path = TRUSTED_DATA_PATH / 'order'
@@ -126,7 +130,11 @@ def create_trusted_order(spark:SparkSession):
sensitive_data_columns = ['order_cpf', 'order_customer_name', 'consumer_customer_name', 'consumer_customer_phone_number']
tmp = tmp.drop(*sensitive_data_columns)
- tmp.write.parquet(str(output_path))
+ # fix dataset data types
+ tmp = fix_order_dtypes(tmp)
+
+ # exports data partitioned by merchant's time at order creation
+ tmp.write.partitionBy('order_order_created_at').parquet(str(output_path))
print(f'Dataset sucessfully exported to `{output_path}`!')
diff --git a/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc b/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc
index 2847d51..9486761 100644
Binary files a/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc and b/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc differ