From 8ad2a947ea0c9d2f55ece23f2c3a3e91f1738baf Mon Sep 17 00:00:00 2001 From: murilo Date: Sun, 6 Sep 2020 21:34:10 -0300 Subject: [PATCH] Create order dataset --- .../03-create-trusted-data-order-items.ipynb | 4 +- .../04-create-trusted-data-order-Copy1.ipynb | 1421 +++++++++++++++ .../04-create-trusted-data-order.ipynb | 1613 ++++++----------- dev/docker-volume/src/DataProcessor.py | 63 +- dev/docker-volume/src/IOController.py | 4 +- .../__pycache__/DataProcessor.cpython-38.pyc | Bin 2941 -> 4302 bytes .../__pycache__/IOController.cpython-38.pyc | Bin 2303 -> 2297 bytes 7 files changed, 2025 insertions(+), 1080 deletions(-) create mode 100644 dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb diff --git a/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb b/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb index fae8cdb..3094430 100644 --- a/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb +++ b/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb @@ -39,7 +39,7 @@ "%autoreload 2\n", "from src.config import RAW_DATA_PATH\n", "from src.IOController import create_pyspark_session\n", - "from src.DataProcessor import create_order_items, explore_dataframe" + "from src.DataProcessor import create_trusted_order_items, explore_dataframe" ] }, { @@ -325,7 +325,7 @@ } ], "source": [ - "tmp = create_order_items(df)\n", + "tmp = create_trusted_order_items(spark)\n", "explore_dataframe(tmp)\n", "tmp.limit(3).toPandas().T" ] diff --git a/dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb b/dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb new file mode 100644 index 0000000..0f4b24d --- /dev/null +++ b/dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb @@ -0,0 +1,1421 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "from src.config import RAW_DATA_PATH\n", + "from src.DataProcessor import explore_dataframe, extract_latest_values, add_prefix, create_trusted_order\n", + "from src.IOController import create_pyspark_session" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting PySpark session. Check your terminal for detailed logging...\n", + "PySpark session sucessfully created.\n" + ] + } + ], + "source": [ + "spark = create_pyspark_session()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting processing to generate Order Items dataset...\n", + "Exporting dataset file system...\n", + "Dataset sucessfully exported to `/home/jovyan/data/trusted/order`!\n" + ] + } + ], + "source": [ + "tmp = create_trusted_order(spark)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pyspark.sql.session.SparkSession" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(spark)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp.limit(3).toPandas().T" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(#rows, #columns) = (3683040, 22)\n", + "root\n", + " |-- cpf: string (nullable = true)\n", + " |-- customer_id: string (nullable = true)\n", + " |-- customer_name: string (nullable = true)\n", + " |-- delivery_address_city: string (nullable = true)\n", + " |-- delivery_address_country: string (nullable = true)\n", + " |-- delivery_address_district: string (nullable = true)\n", + " |-- delivery_address_external_id: string (nullable = true)\n", + " |-- delivery_address_latitude: string (nullable = true)\n", + " |-- delivery_address_longitude: string (nullable = true)\n", + " |-- delivery_address_state: string (nullable = true)\n", + " |-- delivery_address_zip_code: string (nullable = true)\n", + " |-- items: string (nullable = true)\n", + " |-- merchant_id: string (nullable = true)\n", + " |-- merchant_latitude: string (nullable = true)\n", + " |-- merchant_longitude: string (nullable = true)\n", + " |-- merchant_timezone: string (nullable = true)\n", + " |-- order_created_at: timestamp (nullable = true)\n", + " |-- order_id: string (nullable = true)\n", + " |-- order_scheduled: boolean (nullable = true)\n", + " |-- order_scheduled_date: timestamp (nullable = true)\n", + " |-- order_total_amount: double (nullable = true)\n", + " |-- origin_platform: string (nullable = true)\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
cpf805321017634335210396138650991217
customer_id977b9a89-825f-464b-8ef6-0f453d7334c1e969cc0d-388b-4025-9351-0db0f718d81ce08dcc8b-f998-405e-b3f2-7107ea8958cf
customer_nameGUSTAVOMICHELLEVICTOR
delivery_address_cityFRANCASANTOSGUARULHOS
delivery_address_countryBRBRBR
delivery_address_districtJARDIM ESPRAIADOCAMPO GRANDEJARDIM ROSSI
delivery_address_external_id673665587592168765930
delivery_address_latitude-47.39-46.34-46.53
delivery_address_longitude-20.55-23.96-23.44
delivery_address_stateSPSPSP
delivery_address_zip_code144031107071304
items[{\"name\": \"Parmegiana de Filé de Frango (2 pes...[{\"name\": \"Filé Mignon à Cubana\", \"addition\": ...[{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va...
merchant_ideb4197f9-964c-4f87-8307-709e498aab87927d46f9-4bb3-48f7-be1d-584deaf18adc71ad62c5-5947-4518-9846-976fbdd2f881
merchant_latitude-47.39-46.34-46.53
merchant_longitude-20.55-23.96-23.44
merchant_timezoneAmerica/Sao_PauloAmerica/Sao_PauloAmerica/Sao_Paulo
order_created_at2019-01-17 22:50:062019-01-17 17:51:262019-01-17 22:53:47
order_iddd4f8f0a-c2cb-45c6-a002-c3be6b305e5f8dd80f0b-db00-4b88-b7e2-02ca706fc5a5430f9887-a563-45ee-8001-1cb29597d9dd
order_scheduledFalseFalseFalse
order_scheduled_dateNaTNaTNaT
order_total_amount46104.535
origin_platformANDROIDANDROIDIOS
\n", + "
" + ], + "text/plain": [ + " 0 \\\n", + "cpf 80532101763 \n", + "customer_id 977b9a89-825f-464b-8ef6-0f453d7334c1 \n", + "customer_name GUSTAVO \n", + "delivery_address_city FRANCA \n", + "delivery_address_country BR \n", + "delivery_address_district JARDIM ESPRAIADO \n", + "delivery_address_external_id 6736655 \n", + "delivery_address_latitude -47.39 \n", + "delivery_address_longitude -20.55 \n", + "delivery_address_state SP \n", + "delivery_address_zip_code 14403 \n", + "items [{\"name\": \"Parmegiana de Filé de Frango (2 pes... \n", + "merchant_id eb4197f9-964c-4f87-8307-709e498aab87 \n", + "merchant_latitude -47.39 \n", + "merchant_longitude -20.55 \n", + "merchant_timezone America/Sao_Paulo \n", + "order_created_at 2019-01-17 22:50:06 \n", + "order_id dd4f8f0a-c2cb-45c6-a002-c3be6b305e5f \n", + "order_scheduled False \n", + "order_scheduled_date NaT \n", + "order_total_amount 46 \n", + "origin_platform ANDROID \n", + "\n", + " 1 \\\n", + "cpf 43352103961 \n", + "customer_id e969cc0d-388b-4025-9351-0db0f718d81c \n", + "customer_name MICHELLE \n", + "delivery_address_city SANTOS \n", + "delivery_address_country BR \n", + "delivery_address_district CAMPO GRANDE \n", + "delivery_address_external_id 8759216 \n", + "delivery_address_latitude -46.34 \n", + "delivery_address_longitude -23.96 \n", + "delivery_address_state SP \n", + "delivery_address_zip_code 11070 \n", + "items [{\"name\": \"Filé Mignon à Cubana\", \"addition\": ... \n", + "merchant_id 927d46f9-4bb3-48f7-be1d-584deaf18adc \n", + "merchant_latitude -46.34 \n", + "merchant_longitude -23.96 \n", + "merchant_timezone America/Sao_Paulo \n", + "order_created_at 2019-01-17 17:51:26 \n", + "order_id 8dd80f0b-db00-4b88-b7e2-02ca706fc5a5 \n", + "order_scheduled False \n", + "order_scheduled_date NaT \n", + "order_total_amount 104.5 \n", + "origin_platform ANDROID \n", + "\n", + " 2 \n", + "cpf 38650991217 \n", + "customer_id e08dcc8b-f998-405e-b3f2-7107ea8958cf \n", + "customer_name VICTOR \n", + "delivery_address_city GUARULHOS \n", + "delivery_address_country BR \n", + "delivery_address_district JARDIM ROSSI \n", + "delivery_address_external_id 8765930 \n", + "delivery_address_latitude -46.53 \n", + "delivery_address_longitude -23.44 \n", + "delivery_address_state SP \n", + "delivery_address_zip_code 71304 \n", + "items [{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va... \n", + "merchant_id 71ad62c5-5947-4518-9846-976fbdd2f881 \n", + "merchant_latitude -46.53 \n", + "merchant_longitude -23.44 \n", + "merchant_timezone America/Sao_Paulo \n", + "order_created_at 2019-01-17 22:53:47 \n", + "order_id 430f9887-a563-45ee-8001-1cb29597d9dd \n", + "order_scheduled False \n", + "order_scheduled_date NaT \n", + "order_total_amount 35 \n", + "origin_platform IOS " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "o_df = spark.read.parquet(str(RAW_DATA_PATH / 'order'))\n", + "explore_dataframe(o_df)\n", + "o_df.limit(3).toPandas().T" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(#rows, #columns) = (809323, 7)\n", + "root\n", + " |-- customer_id: string (nullable = true)\n", + " |-- language: string (nullable = true)\n", + " |-- created_at: string (nullable = true)\n", + " |-- active: string (nullable = true)\n", + " |-- customer_name: string (nullable = true)\n", + " |-- customer_phone_area: string (nullable = true)\n", + " |-- customer_phone_number: string (nullable = true)\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
customer_id00039466-560f-4e57-85a2-d4753cd901be001a1267-31a3-4f5b-a028-d7e323864b08003ae1d5-67b8-4a04-b055-0e4e9622771a
consumer_languagept-brpt-brpt-br
consumer_created_at2018-04-05T14:49:18.165Z2018-01-14T21:40:02.141Z2018-01-07T03:47:15.554Z
consumer_activetruetruetrue
consumer_customer_nameNUNOADRIELLYPAULA
consumer_customer_phone_area465962
consumer_customer_phone_number816135924231330577347597883
\n", + "
" + ], + "text/plain": [ + " 0 \\\n", + "customer_id 00039466-560f-4e57-85a2-d4753cd901be \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-04-05T14:49:18.165Z \n", + "consumer_active true \n", + "consumer_customer_name NUNO \n", + "consumer_customer_phone_area 46 \n", + "consumer_customer_phone_number 816135924 \n", + "\n", + " 1 \\\n", + "customer_id 001a1267-31a3-4f5b-a028-d7e323864b08 \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-14T21:40:02.141Z \n", + "consumer_active true \n", + "consumer_customer_name ADRIELLY \n", + "consumer_customer_phone_area 59 \n", + "consumer_customer_phone_number 231330577 \n", + "\n", + " 2 \n", + "customer_id 003ae1d5-67b8-4a04-b055-0e4e9622771a \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-07T03:47:15.554Z \n", + "consumer_active true \n", + "consumer_customer_name PAULA \n", + "consumer_customer_phone_area 62 \n", + "consumer_customer_phone_number 347597883 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table = 'consumer'\n", + "c_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", + "explore_dataframe(c_df)\n", + "c_df = add_prefix(c_df, table)\n", + "c_df.limit(3).toPandas().T" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(#rows, #columns) = (7292, 12)\n", + "root\n", + " |-- id: string (nullable = true)\n", + " |-- created_at: string (nullable = true)\n", + " |-- enabled: string (nullable = true)\n", + " |-- price_range: string (nullable = true)\n", + " |-- average_ticket: string (nullable = true)\n", + " |-- takeout_time: string (nullable = true)\n", + " |-- delivery_time: string (nullable = true)\n", + " |-- minimum_order_value: string (nullable = true)\n", + " |-- merchant_zip_code: string (nullable = true)\n", + " |-- merchant_city: string (nullable = true)\n", + " |-- merchant_state: string (nullable = true)\n", + " |-- merchant_country: string (nullable = true)\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
merchant_id02c94103-61f3-4906-a4a9-55611db9f28c15e7f5fd-090d-47b9-9f14-b6f7fce3c95d33ca5d3d-b99f-404d-84d9-8df8f38a2261
restaurant_created_at2017-01-23T12:52:30.910Z2017-01-20T13:14:48.286Z2017-01-23T12:46:33.457Z
restaurant_enabledfalsetruetrue
restaurant_price_range335
restaurant_average_ticket60.060.0100.0
restaurant_takeout_time000
restaurant_delivery_time50045
restaurant_minimum_order_value30.030.010.0
restaurant_merchant_zip_code140255018023090
restaurant_merchant_cityRIBEIRAO PRETOSAO PAULORIO DE JANEIRO
restaurant_merchant_stateSPSPRJ
restaurant_merchant_countryBRBRBR
\n", + "
" + ], + "text/plain": [ + " 0 \\\n", + "merchant_id 02c94103-61f3-4906-a4a9-55611db9f28c \n", + "restaurant_created_at 2017-01-23T12:52:30.910Z \n", + "restaurant_enabled false \n", + "restaurant_price_range 3 \n", + "restaurant_average_ticket 60.0 \n", + "restaurant_takeout_time 0 \n", + "restaurant_delivery_time 50 \n", + "restaurant_minimum_order_value 30.0 \n", + "restaurant_merchant_zip_code 14025 \n", + "restaurant_merchant_city RIBEIRAO PRETO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "\n", + " 1 \\\n", + "merchant_id 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d \n", + "restaurant_created_at 2017-01-20T13:14:48.286Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 3 \n", + "restaurant_average_ticket 60.0 \n", + "restaurant_takeout_time 0 \n", + "restaurant_delivery_time 0 \n", + "restaurant_minimum_order_value 30.0 \n", + "restaurant_merchant_zip_code 50180 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "\n", + " 2 \n", + "merchant_id 33ca5d3d-b99f-404d-84d9-8df8f38a2261 \n", + "restaurant_created_at 2017-01-23T12:46:33.457Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 5 \n", + "restaurant_average_ticket 100.0 \n", + "restaurant_takeout_time 0 \n", + "restaurant_delivery_time 45 \n", + "restaurant_minimum_order_value 10.0 \n", + "restaurant_merchant_zip_code 23090 \n", + "restaurant_merchant_city RIO DE JANEIRO \n", + "restaurant_merchant_state RJ \n", + "restaurant_merchant_country BR " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table = 'restaurant'\n", + "r_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", + "explore_dataframe(r_df)\n", + "r_df = r_df.withColumnRenamed('id', 'merchant_id')\n", + "r_df = add_prefix(r_df, table)\n", + "r_df.limit(3).toPandas().T" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(#rows, #columns) = (11075048, 4)\n", + "root\n", + " |-- created_at: timestamp (nullable = true)\n", + " |-- order_id: string (nullable = true)\n", + " |-- status_id: string (nullable = true)\n", + " |-- value: string (nullable = true)\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status_created_atorder_idstatus_idstatus_value
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDED
12019-01-24 23:04:270002fe02-d7dc-4232-b7ac-3394019ce2407964bf63-007a-484d-a321-e9118ccc2f97REGISTERED
22019-01-24 23:04:280002fe02-d7dc-4232-b7ac-3394019ce240ca16b92b-db8f-4274-b165-929675541a9fPLACED
\n", + "
" + ], + "text/plain": [ + " status_created_at order_id \\\n", + "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", + "1 2019-01-24 23:04:27 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", + "2 2019-01-24 23:04:28 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", + "\n", + " status_id status_value \n", + "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", + "1 7964bf63-007a-484d-a321-e9118ccc2f97 REGISTERED \n", + "2 ca16b92b-db8f-4274-b165-929675541a9f PLACED " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table = 'status'\n", + "s_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", + "explore_dataframe(s_df)\n", + "s_df = add_prefix(s_df, table)\n", + "s_df.limit(3).toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(#rows, #columns) = (2441067, 4)\n", + "root\n", + " |-- status_created_at: timestamp (nullable = true)\n", + " |-- order_id: string (nullable = true)\n", + " |-- status_id: string (nullable = true)\n", + " |-- status_value: string (nullable = true)\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status_created_atorder_idstatus_idstatus_value
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDED
12019-01-03 20:15:060012d95c-9c4b-4244-86b5-dcf87677dcc1c8058ba4-1d1c-4a1f-8401-a7042ef6ba85CONCLUDED
22019-01-06 16:20:270013fc5c-4c10-4402-886c-1b8166e4632ed0a3ffd5-4e48-4cc4-9739-d5764678c19fCONCLUDED
32019-01-03 15:55:1200251da6-aa45-4512-be58-6622a248cdffa621de13-5272-4c7c-969c-3bcd53f0515fCONCLUDED
42019-01-21 22:20:0200273652-efc8-4e7c-95b2-cd3827900e7e74f8ff33-e731-477f-a698-4755577b80a1CONCLUDED
\n", + "
" + ], + "text/plain": [ + " status_created_at order_id \\\n", + "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", + "1 2019-01-03 20:15:06 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", + "2 2019-01-06 16:20:27 0013fc5c-4c10-4402-886c-1b8166e4632e \n", + "3 2019-01-03 15:55:12 00251da6-aa45-4512-be58-6622a248cdff \n", + "4 2019-01-21 22:20:02 00273652-efc8-4e7c-95b2-cd3827900e7e \n", + "\n", + " status_id status_value \n", + "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", + "1 c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 CONCLUDED \n", + "2 d0a3ffd5-4e48-4cc4-9739-d5764678c19f CONCLUDED \n", + "3 a621de13-5272-4c7c-969c-3bcd53f0515f CONCLUDED \n", + "4 74f8ff33-e731-477f-a698-4755577b80a1 CONCLUDED " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s_df = extract_latest_values(df=s_df, id_col='order_id', dt_col='status_created_at')\n", + "explore_dataframe(s_df)\n", + "s_df.limit(5).toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+-------+\n", + "|status_value| count|\n", + "+------------+-------+\n", + "| CONCLUDED|2354218|\n", + "| CANCELLED| 55179|\n", + "| PLACED| 31654|\n", + "| REGISTERED| 16|\n", + "+------------+-------+\n", + "\n" + ] + } + ], + "source": [ + "s_df.groupBy('status_value').count().sort('count', ascending=False).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
order_id0002fe02-d7dc-4232-b7ac-3394019ce2400012d95c-9c4b-4244-86b5-dcf87677dcc10012d95c-9c4b-4244-86b5-dcf87677dcc1
merchant_ide3d24e4d-2f51-4987-8c39-47923b20b9bee66b0dcc-ffa9-42ee-a864-60977672c3efe66b0dcc-ffa9-42ee-a864-60977672c3ef
customer_id97c53c25-bd9b-41cb-8a0d-13cd74509f17e2213649-cede-4770-b6e0-7ac1dd4d3548e2213649-cede-4770-b6e0-7ac1dd4d3548
cpf168541854920526410561178163676650
customer_nameEDUARDOBRUNOBRUNO
delivery_address_citySAO PAULOSAO PAULOSAO PAULO
delivery_address_countryBRBRBR
delivery_address_districtITAIM BIBIPENHA DE FRANCAPENHA DE FRANCA
delivery_address_external_id884712276320907632090
delivery_address_latitude-46.68-46.54-46.54
delivery_address_longitude-23.59-23.52-23.52
delivery_address_stateSPSPSP
delivery_address_zip_code453813610036100
items[{\"name\": \"Pastel Frangolino\", \"addition\": {\"v...[{\"name\": \"Porção Batata Frita\", \"addition\": {...[{\"name\": \"Porção Batata Frita\", \"addition\": {...
merchant_latitude-46.68-46.54-46.54
merchant_longitude-23.59-23.52-23.52
merchant_timezoneAmerica/Sao_PauloAmerica/Sao_PauloAmerica/Sao_Paulo
order_created_at2019-01-24 23:04:272019-01-03 18:12:242018-12-04 18:12:24
order_scheduledFalseFalseFalse
order_scheduled_dateNaTNaTNaT
order_total_amount2717.517.5
origin_platformANDROIDANDROIDANDROID
consumer_languagept-brpt-brpt-br
consumer_created_at2018-04-05T13:20:39.644Z2018-01-06T14:31:43.348Z2018-01-06T14:31:43.348Z
consumer_activetruetruetrue
consumer_customer_nameEDUARDOBRUNOBRUNO
consumer_customer_phone_area836060
consumer_customer_phone_number020082840109441873109441873
restaurant_created_at2017-01-20T13:14:41.451Z2017-01-20T13:14:16.179Z2017-01-20T13:14:16.179Z
restaurant_enabledtruetruetrue
restaurant_price_range111
restaurant_average_ticket30.030.030.0
restaurant_takeout_time02020
restaurant_delivery_time4000
restaurant_minimum_order_value30.00.00.0
restaurant_merchant_zip_code565603635036350
restaurant_merchant_citySAO PAULOSAO PAULOSAO PAULO
restaurant_merchant_stateSPSPSP
restaurant_merchant_countryBRBRBR
status_created_at2019-01-25 01:05:072019-01-03 20:15:062019-01-03 20:15:06
status_idb4298862-fa38-499a-93e2-a76930fb2bcec8058ba4-1d1c-4a1f-8401-a7042ef6ba85c8058ba4-1d1c-4a1f-8401-a7042ef6ba85
status_valueCONCLUDEDCONCLUDEDCONCLUDED
\n", + "
" + ], + "text/plain": [ + " 0 \\\n", + "order_id 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", + "merchant_id e3d24e4d-2f51-4987-8c39-47923b20b9be \n", + "customer_id 97c53c25-bd9b-41cb-8a0d-13cd74509f17 \n", + "cpf 16854185492 \n", + "customer_name EDUARDO \n", + "delivery_address_city SAO PAULO \n", + "delivery_address_country BR \n", + "delivery_address_district ITAIM BIBI \n", + "delivery_address_external_id 8847122 \n", + "delivery_address_latitude -46.68 \n", + "delivery_address_longitude -23.59 \n", + "delivery_address_state SP \n", + "delivery_address_zip_code 45381 \n", + "items [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... \n", + "merchant_latitude -46.68 \n", + "merchant_longitude -23.59 \n", + "merchant_timezone America/Sao_Paulo \n", + "order_created_at 2019-01-24 23:04:27 \n", + "order_scheduled False \n", + "order_scheduled_date NaT \n", + "order_total_amount 27 \n", + "origin_platform ANDROID \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-04-05T13:20:39.644Z \n", + "consumer_active true \n", + "consumer_customer_name EDUARDO \n", + "consumer_customer_phone_area 83 \n", + "consumer_customer_phone_number 020082840 \n", + "restaurant_created_at 2017-01-20T13:14:41.451Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 1 \n", + "restaurant_average_ticket 30.0 \n", + "restaurant_takeout_time 0 \n", + "restaurant_delivery_time 40 \n", + "restaurant_minimum_order_value 30.0 \n", + "restaurant_merchant_zip_code 56560 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "status_created_at 2019-01-25 01:05:07 \n", + "status_id b4298862-fa38-499a-93e2-a76930fb2bce \n", + "status_value CONCLUDED \n", + "\n", + " 1 \\\n", + "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", + "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", + "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", + "cpf 05264105611 \n", + "customer_name BRUNO \n", + "delivery_address_city SAO PAULO \n", + "delivery_address_country BR \n", + "delivery_address_district PENHA DE FRANCA \n", + "delivery_address_external_id 7632090 \n", + "delivery_address_latitude -46.54 \n", + "delivery_address_longitude -23.52 \n", + "delivery_address_state SP \n", + "delivery_address_zip_code 36100 \n", + "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n", + "merchant_latitude -46.54 \n", + "merchant_longitude -23.52 \n", + "merchant_timezone America/Sao_Paulo \n", + "order_created_at 2019-01-03 18:12:24 \n", + "order_scheduled False \n", + "order_scheduled_date NaT \n", + "order_total_amount 17.5 \n", + "origin_platform ANDROID \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-06T14:31:43.348Z \n", + "consumer_active true \n", + "consumer_customer_name BRUNO \n", + "consumer_customer_phone_area 60 \n", + "consumer_customer_phone_number 109441873 \n", + "restaurant_created_at 2017-01-20T13:14:16.179Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 1 \n", + "restaurant_average_ticket 30.0 \n", + "restaurant_takeout_time 20 \n", + "restaurant_delivery_time 0 \n", + "restaurant_minimum_order_value 0.0 \n", + "restaurant_merchant_zip_code 36350 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "status_created_at 2019-01-03 20:15:06 \n", + "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", + "status_value CONCLUDED \n", + "\n", + " 2 \n", + "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", + "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", + "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", + "cpf 78163676650 \n", + "customer_name BRUNO \n", + "delivery_address_city SAO PAULO \n", + "delivery_address_country BR \n", + "delivery_address_district PENHA DE FRANCA \n", + "delivery_address_external_id 7632090 \n", + "delivery_address_latitude -46.54 \n", + "delivery_address_longitude -23.52 \n", + "delivery_address_state SP \n", + "delivery_address_zip_code 36100 \n", + "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n", + "merchant_latitude -46.54 \n", + "merchant_longitude -23.52 \n", + "merchant_timezone America/Sao_Paulo \n", + "order_created_at 2018-12-04 18:12:24 \n", + "order_scheduled False \n", + "order_scheduled_date NaT \n", + "order_total_amount 17.5 \n", + "origin_platform ANDROID \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-06T14:31:43.348Z \n", + "consumer_active true \n", + "consumer_customer_name BRUNO \n", + "consumer_customer_phone_area 60 \n", + "consumer_customer_phone_number 109441873 \n", + "restaurant_created_at 2017-01-20T13:14:16.179Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 1 \n", + "restaurant_average_ticket 30.0 \n", + "restaurant_takeout_time 20 \n", + "restaurant_delivery_time 0 \n", + "restaurant_minimum_order_value 0.0 \n", + "restaurant_merchant_zip_code 36350 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "status_created_at 2019-01-03 20:15:06 \n", + "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", + "status_value CONCLUDED " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = (o_df\n", + " .join(c_df, on='customer_id', how='left')\n", + " .join(r_df, on='merchant_id', how='left')\n", + " .join(s_df, on='order_id', how='left')\n", + " )\n", + "tmp.limit(3).toPandas().T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fix schema and anonymize sensitive data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# for dtype, cols in dtypes.items():\n", + "# for col in cols:\n", + "# df = df.withColumn(col, df[col].cast(dtype))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb b/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb index 0c0876c..31255a6 100644 --- a/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb +++ b/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb @@ -2,23 +2,14 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "from src.config import RAW_DATA_PATH\n", - "from src.DataProcessor import create_order_items, explore_dataframe\n", + "from src.DataProcessor import explore_dataframe, extract_latest_values, add_prefix, create_trusted_order\n", "from src.IOController import create_pyspark_session" ] }, @@ -42,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -313,7 +304,7 @@ "origin_platform IOS " ] }, - "execution_count": 15, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -326,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -366,44 +357,52 @@ " \n", " \n", " \n", - " customer_id\n", - " language\n", - " created_at\n", - " active\n", - " customer_name\n", - " customer_phone_area\n", - " customer_phone_number\n", + " 0\n", + " 1\n", + " 2\n", " \n", " \n", " \n", " \n", - " 0\n", + " customer_id\n", " 00039466-560f-4e57-85a2-d4753cd901be\n", - " pt-br\n", - " 2018-04-05T14:49:18.165Z\n", - " true\n", - " NUNO\n", - " 46\n", - " 816135924\n", + " 001a1267-31a3-4f5b-a028-d7e323864b08\n", + " 003ae1d5-67b8-4a04-b055-0e4e9622771a\n", " \n", " \n", - " 1\n", - " 001a1267-31a3-4f5b-a028-d7e323864b08\n", + " consumer_language\n", + " pt-br\n", + " pt-br\n", " pt-br\n", - " 2018-01-14T21:40:02.141Z\n", - " true\n", - " ADRIELLY\n", - " 59\n", - " 231330577\n", " \n", " \n", - " 2\n", - " 003ae1d5-67b8-4a04-b055-0e4e9622771a\n", - " pt-br\n", + " consumer_created_at\n", + " 2018-04-05T14:49:18.165Z\n", + " 2018-01-14T21:40:02.141Z\n", " 2018-01-07T03:47:15.554Z\n", + " \n", + " \n", + " consumer_active\n", + " true\n", " true\n", + " true\n", + " \n", + " \n", + " consumer_customer_name\n", + " NUNO\n", + " ADRIELLY\n", " PAULA\n", + " \n", + " \n", + " consumer_customer_phone_area\n", + " 46\n", + " 59\n", " 62\n", + " \n", + " \n", + " consumer_customer_phone_number\n", + " 816135924\n", + " 231330577\n", " 347597883\n", " \n", " \n", @@ -411,31 +410,50 @@ "" ], "text/plain": [ - " customer_id language created_at \\\n", - "0 00039466-560f-4e57-85a2-d4753cd901be pt-br 2018-04-05T14:49:18.165Z \n", - "1 001a1267-31a3-4f5b-a028-d7e323864b08 pt-br 2018-01-14T21:40:02.141Z \n", - "2 003ae1d5-67b8-4a04-b055-0e4e9622771a pt-br 2018-01-07T03:47:15.554Z \n", + " 0 \\\n", + "customer_id 00039466-560f-4e57-85a2-d4753cd901be \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-04-05T14:49:18.165Z \n", + "consumer_active true \n", + "consumer_customer_name NUNO \n", + "consumer_customer_phone_area 46 \n", + "consumer_customer_phone_number 816135924 \n", + "\n", + " 1 \\\n", + "customer_id 001a1267-31a3-4f5b-a028-d7e323864b08 \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-14T21:40:02.141Z \n", + "consumer_active true \n", + "consumer_customer_name ADRIELLY \n", + "consumer_customer_phone_area 59 \n", + "consumer_customer_phone_number 231330577 \n", "\n", - " active customer_name customer_phone_area customer_phone_number \n", - "0 true NUNO 46 816135924 \n", - "1 true ADRIELLY 59 231330577 \n", - "2 true PAULA 62 347597883 " + " 2 \n", + "customer_id 003ae1d5-67b8-4a04-b055-0e4e9622771a \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-07T03:47:15.554Z \n", + "consumer_active true \n", + "consumer_customer_name PAULA \n", + "consumer_customer_phone_area 62 \n", + "consumer_customer_phone_number 347597883 " ] }, - "execution_count": 14, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c_df = spark.read.parquet(str(RAW_DATA_PATH / 'consumer'))\n", + "table = 'consumer'\n", + "c_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", "explore_dataframe(c_df)\n", - "c_df.limit(3).toPandas()" + "c_df = add_prefix(c_df, table)\n", + "c_df.limit(3).toPandas().T" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -487,73 +505,73 @@ " \n", " \n", " \n", - " id\n", + " merchant_id\n", " 02c94103-61f3-4906-a4a9-55611db9f28c\n", " 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d\n", " 33ca5d3d-b99f-404d-84d9-8df8f38a2261\n", " \n", " \n", - " created_at\n", + " restaurant_created_at\n", " 2017-01-23T12:52:30.910Z\n", " 2017-01-20T13:14:48.286Z\n", " 2017-01-23T12:46:33.457Z\n", " \n", " \n", - " enabled\n", + " restaurant_enabled\n", " false\n", " true\n", " true\n", " \n", " \n", - " price_range\n", + " restaurant_price_range\n", " 3\n", " 3\n", " 5\n", " \n", " \n", - " average_ticket\n", + " restaurant_average_ticket\n", " 60.0\n", " 60.0\n", " 100.0\n", " \n", " \n", - " takeout_time\n", + " restaurant_takeout_time\n", " 0\n", " 0\n", " 0\n", " \n", " \n", - " delivery_time\n", + " restaurant_delivery_time\n", " 50\n", " 0\n", " 45\n", " \n", " \n", - " minimum_order_value\n", + " restaurant_minimum_order_value\n", " 30.0\n", " 30.0\n", " 10.0\n", " \n", " \n", - " merchant_zip_code\n", + " restaurant_merchant_zip_code\n", " 14025\n", " 50180\n", " 23090\n", " \n", " \n", - " merchant_city\n", + " restaurant_merchant_city\n", " RIBEIRAO PRETO\n", " SAO PAULO\n", " RIO DE JANEIRO\n", " \n", " \n", - " merchant_state\n", + " restaurant_merchant_state\n", " SP\n", " SP\n", " RJ\n", " \n", " \n", - " merchant_country\n", + " restaurant_merchant_country\n", " BR\n", " BR\n", " BR\n", @@ -563,63 +581,66 @@ "" ], "text/plain": [ - " 0 \\\n", - "id 02c94103-61f3-4906-a4a9-55611db9f28c \n", - "created_at 2017-01-23T12:52:30.910Z \n", - "enabled false \n", - "price_range 3 \n", - "average_ticket 60.0 \n", - "takeout_time 0 \n", - "delivery_time 50 \n", - "minimum_order_value 30.0 \n", - "merchant_zip_code 14025 \n", - "merchant_city RIBEIRAO PRETO \n", - "merchant_state SP \n", - "merchant_country BR \n", + " 0 \\\n", + "merchant_id 02c94103-61f3-4906-a4a9-55611db9f28c \n", + "restaurant_created_at 2017-01-23T12:52:30.910Z \n", + "restaurant_enabled false \n", + "restaurant_price_range 3 \n", + "restaurant_average_ticket 60.0 \n", + "restaurant_takeout_time 0 \n", + "restaurant_delivery_time 50 \n", + "restaurant_minimum_order_value 30.0 \n", + "restaurant_merchant_zip_code 14025 \n", + "restaurant_merchant_city RIBEIRAO PRETO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", "\n", - " 1 \\\n", - "id 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d \n", - "created_at 2017-01-20T13:14:48.286Z \n", - "enabled true \n", - "price_range 3 \n", - "average_ticket 60.0 \n", - "takeout_time 0 \n", - "delivery_time 0 \n", - "minimum_order_value 30.0 \n", - "merchant_zip_code 50180 \n", - "merchant_city SAO PAULO \n", - "merchant_state SP \n", - "merchant_country BR \n", + " 1 \\\n", + "merchant_id 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d \n", + "restaurant_created_at 2017-01-20T13:14:48.286Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 3 \n", + "restaurant_average_ticket 60.0 \n", + "restaurant_takeout_time 0 \n", + "restaurant_delivery_time 0 \n", + "restaurant_minimum_order_value 30.0 \n", + "restaurant_merchant_zip_code 50180 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", "\n", - " 2 \n", - "id 33ca5d3d-b99f-404d-84d9-8df8f38a2261 \n", - "created_at 2017-01-23T12:46:33.457Z \n", - "enabled true \n", - "price_range 5 \n", - "average_ticket 100.0 \n", - "takeout_time 0 \n", - "delivery_time 45 \n", - "minimum_order_value 10.0 \n", - "merchant_zip_code 23090 \n", - "merchant_city RIO DE JANEIRO \n", - "merchant_state RJ \n", - "merchant_country BR " + " 2 \n", + "merchant_id 33ca5d3d-b99f-404d-84d9-8df8f38a2261 \n", + "restaurant_created_at 2017-01-23T12:46:33.457Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 5 \n", + "restaurant_average_ticket 100.0 \n", + "restaurant_takeout_time 0 \n", + "restaurant_delivery_time 45 \n", + "restaurant_minimum_order_value 10.0 \n", + "restaurant_merchant_zip_code 23090 \n", + "restaurant_merchant_city RIO DE JANEIRO \n", + "restaurant_merchant_state RJ \n", + "restaurant_merchant_country BR " ] }, - "execution_count": 17, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "r_df = spark.read.parquet(str(RAW_DATA_PATH / 'restaurant'))\n", + "table = 'restaurant'\n", + "r_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", "explore_dataframe(r_df)\n", + "r_df = r_df.withColumnRenamed('id', 'merchant_id')\n", + "r_df = add_prefix(r_df, table)\n", "r_df.limit(3).toPandas().T" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -656,10 +677,10 @@ " \n", " \n", " \n", - " created_at\n", + " status_created_at\n", " order_id\n", " status_id\n", - " value\n", + " status_value\n", " \n", " \n", " \n", @@ -689,73 +710,170 @@ "" ], "text/plain": [ - " created_at order_id \\\n", + " status_created_at order_id \\\n", "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", "1 2019-01-24 23:04:27 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", "2 2019-01-24 23:04:28 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", "\n", - " status_id value \n", - "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", - "1 7964bf63-007a-484d-a321-e9118ccc2f97 REGISTERED \n", - "2 ca16b92b-db8f-4274-b165-929675541a9f PLACED " + " status_id status_value \n", + "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", + "1 7964bf63-007a-484d-a321-e9118ccc2f97 REGISTERED \n", + "2 ca16b92b-db8f-4274-b165-929675541a9f PLACED " ] }, - "execution_count": 74, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "s_df = spark.read.parquet(str(RAW_DATA_PATH / 'status'))\n", + "table = 'status'\n", + "s_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", "explore_dataframe(s_df)\n", + "s_df = add_prefix(s_df, table)\n", "s_df.limit(3).toPandas()" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(#rows, #columns) = (3683040, 22)\n", + "(#rows, #columns) = (2441067, 4)\n", "root\n", - " |-- cpf: string (nullable = true)\n", - " |-- customer_id: string (nullable = true)\n", - " |-- customer_name: string (nullable = true)\n", - " |-- delivery_address_city: string (nullable = true)\n", - " |-- delivery_address_country: string (nullable = true)\n", - " |-- delivery_address_district: string (nullable = true)\n", - " |-- delivery_address_external_id: string (nullable = true)\n", - " |-- delivery_address_latitude: string (nullable = true)\n", - " |-- delivery_address_longitude: string (nullable = true)\n", - " |-- delivery_address_state: string (nullable = true)\n", - " |-- delivery_address_zip_code: string (nullable = true)\n", - " |-- items: string (nullable = true)\n", - " |-- merchant_id: string (nullable = true)\n", - " |-- merchant_latitude: string (nullable = true)\n", - " |-- merchant_longitude: string (nullable = true)\n", - " |-- merchant_timezone: string (nullable = true)\n", - " |-- order_created_at: timestamp (nullable = true)\n", + " |-- status_created_at: timestamp (nullable = true)\n", " |-- order_id: string (nullable = true)\n", - " |-- order_scheduled: boolean (nullable = true)\n", - " |-- order_scheduled_date: timestamp (nullable = true)\n", - " |-- order_total_amount: double (nullable = true)\n", - " |-- origin_platform: string (nullable = true)\n", + " |-- status_id: string (nullable = true)\n", + " |-- status_value: string (nullable = true)\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
status_created_atorder_idstatus_idstatus_value
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDED
12019-01-03 20:15:060012d95c-9c4b-4244-86b5-dcf87677dcc1c8058ba4-1d1c-4a1f-8401-a7042ef6ba85CONCLUDED
22019-01-06 16:20:270013fc5c-4c10-4402-886c-1b8166e4632ed0a3ffd5-4e48-4cc4-9739-d5764678c19fCONCLUDED
32019-01-03 15:55:1200251da6-aa45-4512-be58-6622a248cdffa621de13-5272-4c7c-969c-3bcd53f0515fCONCLUDED
42019-01-21 22:20:0200273652-efc8-4e7c-95b2-cd3827900e7e74f8ff33-e731-477f-a698-4755577b80a1CONCLUDED
\n", + "
" + ], + "text/plain": [ + " status_created_at order_id \\\n", + "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", + "1 2019-01-03 20:15:06 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", + "2 2019-01-06 16:20:27 0013fc5c-4c10-4402-886c-1b8166e4632e \n", + "3 2019-01-03 15:55:12 00251da6-aa45-4512-be58-6622a248cdff \n", + "4 2019-01-21 22:20:02 00273652-efc8-4e7c-95b2-cd3827900e7e \n", + "\n", + " status_id status_value \n", + "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", + "1 c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 CONCLUDED \n", + "2 d0a3ffd5-4e48-4cc4-9739-d5764678c19f CONCLUDED \n", + "3 a621de13-5272-4c7c-969c-3bcd53f0515f CONCLUDED \n", + "4 74f8ff33-e731-477f-a698-4755577b80a1 CONCLUDED " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s_df = extract_latest_values(df=s_df, id_col='order_id', dt_col='status_created_at')\n", + "explore_dataframe(s_df)\n", + "s_df.limit(5).toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+-------+\n", + "|status_value| count|\n", + "+------------+-------+\n", + "| CONCLUDED|2354218|\n", + "| CANCELLED| 55179|\n", + "| PLACED| 31654|\n", + "| REGISTERED| 16|\n", + "+------------+-------+\n", "\n" ] } ], "source": [ - "df = spark.read.parquet(str(RAW_DATA_PATH / 'order'))\n", - "explore_dataframe(df)" + "s_df.groupBy('status_value').count().sort('count', ascending=False).show()" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -786,28 +904,40 @@ " \n", " \n", " \n", - " cpf\n", - " 80532101763\n", - " 43352103961\n", - " 38650991217\n", + " order_id\n", + " 0002fe02-d7dc-4232-b7ac-3394019ce240\n", + " 0012d95c-9c4b-4244-86b5-dcf87677dcc1\n", + " 0012d95c-9c4b-4244-86b5-dcf87677dcc1\n", + " \n", + " \n", + " merchant_id\n", + " e3d24e4d-2f51-4987-8c39-47923b20b9be\n", + " e66b0dcc-ffa9-42ee-a864-60977672c3ef\n", + " e66b0dcc-ffa9-42ee-a864-60977672c3ef\n", " \n", " \n", " customer_id\n", - " 977b9a89-825f-464b-8ef6-0f453d7334c1\n", - " e969cc0d-388b-4025-9351-0db0f718d81c\n", - " e08dcc8b-f998-405e-b3f2-7107ea8958cf\n", + " 97c53c25-bd9b-41cb-8a0d-13cd74509f17\n", + " e2213649-cede-4770-b6e0-7ac1dd4d3548\n", + " e2213649-cede-4770-b6e0-7ac1dd4d3548\n", + " \n", + " \n", + " cpf\n", + " 16854185492\n", + " 05264105611\n", + " 78163676650\n", " \n", " \n", " customer_name\n", - " GUSTAVO\n", - " MICHELLE\n", - " VICTOR\n", + " EDUARDO\n", + " BRUNO\n", + " BRUNO\n", " \n", " \n", " delivery_address_city\n", - " FRANCA\n", - " SANTOS\n", - " GUARULHOS\n", + " SAO PAULO\n", + " SAO PAULO\n", + " SAO PAULO\n", " \n", " \n", " delivery_address_country\n", @@ -817,27 +947,27 @@ " \n", " \n", " delivery_address_district\n", - " JARDIM ESPRAIADO\n", - " CAMPO GRANDE\n", - " JARDIM ROSSI\n", + " ITAIM BIBI\n", + " PENHA DE FRANCA\n", + " PENHA DE FRANCA\n", " \n", " \n", " delivery_address_external_id\n", - " 6736655\n", - " 8759216\n", - " 8765930\n", + " 8847122\n", + " 7632090\n", + " 7632090\n", " \n", " \n", " delivery_address_latitude\n", - " -47.39\n", - " -46.34\n", - " -46.53\n", + " -46.68\n", + " -46.54\n", + " -46.54\n", " \n", " \n", " delivery_address_longitude\n", - " -20.55\n", - " -23.96\n", - " -23.44\n", + " -23.59\n", + " -23.52\n", + " -23.52\n", " \n", " \n", " delivery_address_state\n", @@ -847,33 +977,27 @@ " \n", " \n", " delivery_address_zip_code\n", - " 14403\n", - " 11070\n", - " 71304\n", + " 45381\n", + " 36100\n", + " 36100\n", " \n", " \n", " items\n", - " [{\"name\": \"Parmegiana de Filé de Frango (2 pes...\n", - " [{\"name\": \"Filé Mignon à Cubana\", \"addition\": ...\n", - " [{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va...\n", - " \n", - " \n", - " merchant_id\n", - " eb4197f9-964c-4f87-8307-709e498aab87\n", - " 927d46f9-4bb3-48f7-be1d-584deaf18adc\n", - " 71ad62c5-5947-4518-9846-976fbdd2f881\n", + " [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v...\n", + " [{\"name\": \"Porção Batata Frita\", \"addition\": {...\n", + " [{\"name\": \"Porção Batata Frita\", \"addition\": {...\n", " \n", " \n", " merchant_latitude\n", - " -47.39\n", - " -46.34\n", - " -46.53\n", + " -46.68\n", + " -46.54\n", + " -46.54\n", " \n", " \n", " merchant_longitude\n", - " -20.55\n", - " -23.96\n", - " -23.44\n", + " -23.59\n", + " -23.52\n", + " -23.52\n", " \n", " \n", " merchant_timezone\n", @@ -883,15 +1007,9 @@ " \n", " \n", " order_created_at\n", - " 2019-01-17 22:50:06\n", - " 2019-01-17 17:51:26\n", - " 2019-01-17 22:53:47\n", - " \n", - " \n", - " order_id\n", - " dd4f8f0a-c2cb-45c6-a002-c3be6b305e5f\n", - " 8dd80f0b-db00-4b88-b7e2-02ca706fc5a5\n", - " 430f9887-a563-45ee-8001-1cb29597d9dd\n", + " 2019-01-24 23:04:27\n", + " 2019-01-03 18:12:24\n", + " 2018-12-04 18:12:24\n", " \n", " \n", " order_scheduled\n", @@ -907,809 +1025,286 @@ " \n", " \n", " order_total_amount\n", - " 46\n", - " 104.5\n", - " 35\n", + " 27\n", + " 17.5\n", + " 17.5\n", " \n", " \n", " origin_platform\n", " ANDROID\n", " ANDROID\n", - " IOS\n", + " ANDROID\n", " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " 0 \\\n", - "cpf 80532101763 \n", - "customer_id 977b9a89-825f-464b-8ef6-0f453d7334c1 \n", - "customer_name GUSTAVO \n", - "delivery_address_city FRANCA \n", - "delivery_address_country BR \n", - "delivery_address_district JARDIM ESPRAIADO \n", - "delivery_address_external_id 6736655 \n", - "delivery_address_latitude -47.39 \n", - "delivery_address_longitude -20.55 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 14403 \n", - "items [{\"name\": \"Parmegiana de Filé de Frango (2 pes... \n", - "merchant_id eb4197f9-964c-4f87-8307-709e498aab87 \n", - "merchant_latitude -47.39 \n", - "merchant_longitude -20.55 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-17 22:50:06 \n", - "order_id dd4f8f0a-c2cb-45c6-a002-c3be6b305e5f \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 46 \n", - "origin_platform ANDROID \n", - "\n", - " 1 \\\n", - "cpf 43352103961 \n", - "customer_id e969cc0d-388b-4025-9351-0db0f718d81c \n", - "customer_name MICHELLE \n", - "delivery_address_city SANTOS \n", - "delivery_address_country BR \n", - "delivery_address_district CAMPO GRANDE \n", - "delivery_address_external_id 8759216 \n", - "delivery_address_latitude -46.34 \n", - "delivery_address_longitude -23.96 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 11070 \n", - "items [{\"name\": \"Filé Mignon à Cubana\", \"addition\": ... \n", - "merchant_id 927d46f9-4bb3-48f7-be1d-584deaf18adc \n", - "merchant_latitude -46.34 \n", - "merchant_longitude -23.96 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-17 17:51:26 \n", - "order_id 8dd80f0b-db00-4b88-b7e2-02ca706fc5a5 \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 104.5 \n", - "origin_platform ANDROID \n", - "\n", - " 2 \n", - "cpf 38650991217 \n", - "customer_id e08dcc8b-f998-405e-b3f2-7107ea8958cf \n", - "customer_name VICTOR \n", - "delivery_address_city GUARULHOS \n", - "delivery_address_country BR \n", - "delivery_address_district JARDIM ROSSI \n", - "delivery_address_external_id 8765930 \n", - "delivery_address_latitude -46.53 \n", - "delivery_address_longitude -23.44 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 71304 \n", - "items [{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va... \n", - "merchant_id 71ad62c5-5947-4518-9846-976fbdd2f881 \n", - "merchant_latitude -46.53 \n", - "merchant_longitude -23.44 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-17 22:53:47 \n", - "order_id 430f9887-a563-45ee-8001-1cb29597d9dd \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 35 \n", - "origin_platform IOS " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.limit(3).toPandas().T" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "sensitive_data_columns = [\n", - " 'cpf',\n", - " 'customer_name',\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[created_at: timestamp, order_id: string, status_id: string, value: string]" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s_df.dropDuplicates(subset=['order_id'])" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
created_atorder_idstatus_idvalue
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDED
12019-01-03 20:15:060012d95c-9c4b-4244-86b5-dcf87677dcc1c8058ba4-1d1c-4a1f-8401-a7042ef6ba85CONCLUDED
22019-01-06 16:20:270013fc5c-4c10-4402-886c-1b8166e4632ed0a3ffd5-4e48-4cc4-9739-d5764678c19fCONCLUDED
32019-01-03 15:55:1200251da6-aa45-4512-be58-6622a248cdffa621de13-5272-4c7c-969c-3bcd53f0515fCONCLUDED
42019-01-21 22:20:0200273652-efc8-4e7c-95b2-cd3827900e7e74f8ff33-e731-477f-a698-4755577b80a1CONCLUDED
\n", - "
" - ], - "text/plain": [ - " created_at order_id \\\n", - "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "1 2019-01-03 20:15:06 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", - "2 2019-01-06 16:20:27 0013fc5c-4c10-4402-886c-1b8166e4632e \n", - "3 2019-01-03 15:55:12 00251da6-aa45-4512-be58-6622a248cdff \n", - "4 2019-01-21 22:20:02 00273652-efc8-4e7c-95b2-cd3827900e7e \n", - "\n", - " status_id value \n", - "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", - "1 c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 CONCLUDED \n", - "2 d0a3ffd5-4e48-4cc4-9739-d5764678c19f CONCLUDED \n", - "3 a621de13-5272-4c7c-969c-3bcd53f0515f CONCLUDED \n", - "4 74f8ff33-e731-477f-a698-4755577b80a1 CONCLUDED " - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s_df.sort('created_at', ascending=False).drop_duplicates(subset=['order_id']).limit(5).toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
order_idmax(created_at_unix)
0000013ce-ce11-43fe-a6ac-38e616e1982a1548901207
10000149b-2294-4a99-8ced-9af03363b7091547074208
200001892-6731-4eab-96e4-e60d88814e931547337003
300001cdb-2399-417f-b630-f87919d25eaa1548731106
4000021b4-e566-4bf8-ad3a-6da99fc2987e1548207305
\n", - "
" - ], - "text/plain": [ - " order_id max(created_at_unix)\n", - "0 000013ce-ce11-43fe-a6ac-38e616e1982a 1548901207\n", - "1 0000149b-2294-4a99-8ced-9af03363b709 1547074208\n", - "2 00001892-6731-4eab-96e4-e60d88814e93 1547337003\n", - "3 00001cdb-2399-417f-b630-f87919d25eaa 1548731106\n", - "4 000021b4-e566-4bf8-ad3a-6da99fc2987e 1548207305" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pyspark.sql.functions import unix_timestamp\n", - "# s_df = s_df.withColumn('created_at_unix', unix_timestamp('created_at'))\n", - "# s_df.groupBy(['order_id']).max('created_at_unix').sort('order_id').limit(5).toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
created_atorder_idstatus_idvalueunix_created_at
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDED1548378307
12019-01-24 23:04:270002fe02-d7dc-4232-b7ac-3394019ce2407964bf63-007a-484d-a321-e9118ccc2f97REGISTERED1548371067
22019-01-24 23:04:280002fe02-d7dc-4232-b7ac-3394019ce240ca16b92b-db8f-4274-b165-929675541a9fPLACED1548371068
32019-01-18 00:45:02000cef8c-83c7-49eb-a0fb-404e6dc2150ebf43cc29-c3c1-4f3a-9a6c-deb902ca286cCONCLUDED1547772302
42019-01-18 00:45:02000cef8c-83c7-49eb-a0fb-404e6dc2150ebf43cc29-c3c1-4f3a-9a6c-deb902ca286cCONCLUDED1547772302
\n", - "
" - ], - "text/plain": [ - " created_at order_id \\\n", - "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "1 2019-01-24 23:04:27 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "2 2019-01-24 23:04:28 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "3 2019-01-18 00:45:02 000cef8c-83c7-49eb-a0fb-404e6dc2150e \n", - "4 2019-01-18 00:45:02 000cef8c-83c7-49eb-a0fb-404e6dc2150e \n", - "\n", - " status_id value unix_created_at \n", - "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED 1548378307 \n", - "1 7964bf63-007a-484d-a321-e9118ccc2f97 REGISTERED 1548371067 \n", - "2 ca16b92b-db8f-4274-b165-929675541a9f PLACED 1548371068 \n", - "3 bf43cc29-c3c1-4f3a-9a6c-deb902ca286c CONCLUDED 1547772302 \n", - "4 bf43cc29-c3c1-4f3a-9a6c-deb902ca286c CONCLUDED 1547772302 " - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s_df.limit(5).toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
created_atorder_idstatus_idvalue
02019-01-31 00:19:37000013ce-ce11-43fe-a6ac-38e616e1982a0541549e-e2cc-4ce9-b71e-95460f0c1499REGISTERED
12019-01-31 00:19:38000013ce-ce11-43fe-a6ac-38e616e1982ab23fe957-4f5b-401a-bfe0-219d912a4e75PLACEDconsumer_languagept-brpt-brpt-br
22019-01-31 02:20:07000013ce-ce11-43fe-a6ac-38e616e1982ae58f1041-beb3-4298-8a6f-b1dedcadeac2CONCLUDEDconsumer_created_at2018-04-05T13:20:39.644Z2018-01-06T14:31:43.348Z2018-01-06T14:31:43.348Z
32019-01-09 20:46:530000149b-2294-4a99-8ced-9af03363b709d97e8616-e71a-4221-b86e-77f0c0fdad20REGISTEREDconsumer_activetruetruetrue
42019-01-09 20:46:540000149b-2294-4a99-8ced-9af03363b7092e566672-0ccc-4188-899f-89796fcfd9a9PLACED
\n", - "
" - ], - "text/plain": [ - " created_at order_id \\\n", - "0 2019-01-31 00:19:37 000013ce-ce11-43fe-a6ac-38e616e1982a \n", - "1 2019-01-31 00:19:38 000013ce-ce11-43fe-a6ac-38e616e1982a \n", - "2 2019-01-31 02:20:07 000013ce-ce11-43fe-a6ac-38e616e1982a \n", - "3 2019-01-09 20:46:53 0000149b-2294-4a99-8ced-9af03363b709 \n", - "4 2019-01-09 20:46:54 0000149b-2294-4a99-8ced-9af03363b709 \n", - "\n", - " status_id value \n", - "0 0541549e-e2cc-4ce9-b71e-95460f0c1499 REGISTERED \n", - "1 b23fe957-4f5b-401a-bfe0-219d912a4e75 PLACED \n", - "2 e58f1041-beb3-4298-8a6f-b1dedcadeac2 CONCLUDED \n", - "3 d97e8616-e71a-4221-b86e-77f0c0fdad20 REGISTERED \n", - "4 2e566672-0ccc-4188-899f-89796fcfd9a9 PLACED " - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s_df.sort(['order_id', 'created_at']).limit(5).toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[created_at: timestamp, order_id: string, status_id: string, value: string]" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s_df.coun" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - "
created_atorder_idstatus_idvalueconsumer_customer_nameEDUARDOBRUNOBRUNO
02019-01-24 23:04:270002fe02-d7dc-4232-b7ac-3394019ce2407964bf63-007a-484d-a321-e9118ccc2f97REGISTEREDconsumer_customer_phone_area836060
12019-01-03 18:12:230012d95c-9c4b-4244-86b5-dcf87677dcc1e5791af8-fa08-4333-8487-194be9fcfc4fREGISTEREDconsumer_customer_phone_number020082840109441873109441873
22019-01-06 14:16:170013fc5c-4c10-4402-886c-1b8166e4632e6b36c2fc-cbb5-4bd5-8ce7-7686de612518REGISTERED
restaurant_created_at2017-01-20T13:14:41.451Z2017-01-20T13:14:16.179Z2017-01-20T13:14:16.179Z
32019-01-03 13:50:5400251da6-aa45-4512-be58-6622a248cdff155ac934-fafa-492f-a74b-d885d94379d3REGISTEREDrestaurant_enabledtruetruetrue
42019-01-21 20:17:5400273652-efc8-4e7c-95b2-cd3827900e7eac143627-f177-446c-885b-3b1025a619eeREGISTEREDrestaurant_price_range111
\n", - "
" - ], - "text/plain": [ - " created_at order_id \\\n", - "0 2019-01-24 23:04:27 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "1 2019-01-03 18:12:23 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", - "2 2019-01-06 14:16:17 0013fc5c-4c10-4402-886c-1b8166e4632e \n", - "3 2019-01-03 13:50:54 00251da6-aa45-4512-be58-6622a248cdff \n", - "4 2019-01-21 20:17:54 00273652-efc8-4e7c-95b2-cd3827900e7e \n", - "\n", - " status_id value \n", - "0 7964bf63-007a-484d-a321-e9118ccc2f97 REGISTERED \n", - "1 e5791af8-fa08-4333-8487-194be9fcfc4f REGISTERED \n", - "2 6b36c2fc-cbb5-4bd5-8ce7-7686de612518 REGISTERED \n", - "3 155ac934-fafa-492f-a74b-d885d94379d3 REGISTERED \n", - "4 ac143627-f177-446c-885b-3b1025a619ee REGISTERED " - ] - }, - "execution_count": 76, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from src.DataProcessor import extract_latest_value\n", - "extract_latest_value(df=s_df, id_col='order_id', dt_col='created_at').limit(5).toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - "
created_atorder_idstatus_idvalue
restaurant_average_ticket30.030.030.0
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDEDrestaurant_takeout_time02020
12019-01-03 20:15:060012d95c-9c4b-4244-86b5-dcf87677dcc1c8058ba4-1d1c-4a1f-8401-a7042ef6ba85CONCLUDEDrestaurant_delivery_time4000
22019-01-06 16:20:270013fc5c-4c10-4402-886c-1b8166e4632ed0a3ffd5-4e48-4cc4-9739-d5764678c19fCONCLUDEDrestaurant_minimum_order_value30.00.00.0
32019-01-03 15:55:1200251da6-aa45-4512-be58-6622a248cdffa621de13-5272-4c7c-969c-3bcd53f0515fCONCLUDEDrestaurant_merchant_zip_code565603635036350
42019-01-21 22:20:0200273652-efc8-4e7c-95b2-cd3827900e7e74f8ff33-e731-477f-a698-4755577b80a1CONCLUDEDrestaurant_merchant_citySAO PAULOSAO PAULOSAO PAULO
\n", - "
" - ], - "text/plain": [ - " created_at order_id \\\n", - "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "1 2019-01-03 20:15:06 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", - "2 2019-01-06 16:20:27 0013fc5c-4c10-4402-886c-1b8166e4632e \n", - "3 2019-01-03 15:55:12 00251da6-aa45-4512-be58-6622a248cdff \n", - "4 2019-01-21 22:20:02 00273652-efc8-4e7c-95b2-cd3827900e7e \n", - "\n", - " status_id value \n", - "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", - "1 c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 CONCLUDED \n", - "2 d0a3ffd5-4e48-4cc4-9739-d5764678c19f CONCLUDED \n", - "3 a621de13-5272-4c7c-969c-3bcd53f0515f CONCLUDED \n", - "4 74f8ff33-e731-477f-a698-4755577b80a1 CONCLUDED " - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from src.DataProcessor import extract_latest_value\n", - "extract_latest_value(df=s_df, id_col='order_id', dt_col='created_at').limit(5).toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
valuecount
restaurant_merchant_stateSPSPSP
0CONCLUDED2354218restaurant_merchant_countryBRBRBR
1PLACED31654status_created_at2019-01-25 01:05:072019-01-03 20:15:062019-01-03 20:15:06
2CANCELLED55179status_idb4298862-fa38-499a-93e2-a76930fb2bcec8058ba4-1d1c-4a1f-8401-a7042ef6ba85c8058ba4-1d1c-4a1f-8401-a7042ef6ba85
3REGISTERED16status_valueCONCLUDEDCONCLUDEDCONCLUDED
\n", "
" ], "text/plain": [ - " value count\n", - "0 CONCLUDED 2354218\n", - "1 PLACED 31654\n", - "2 CANCELLED 55179\n", - "3 REGISTERED 16" + " 0 \\\n", + "order_id 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", + "merchant_id e3d24e4d-2f51-4987-8c39-47923b20b9be \n", + "customer_id 97c53c25-bd9b-41cb-8a0d-13cd74509f17 \n", + "cpf 16854185492 \n", + "customer_name EDUARDO \n", + "delivery_address_city SAO PAULO \n", + "delivery_address_country BR \n", + "delivery_address_district ITAIM BIBI \n", + "delivery_address_external_id 8847122 \n", + "delivery_address_latitude -46.68 \n", + "delivery_address_longitude -23.59 \n", + "delivery_address_state SP \n", + "delivery_address_zip_code 45381 \n", + "items [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... \n", + "merchant_latitude -46.68 \n", + "merchant_longitude -23.59 \n", + "merchant_timezone America/Sao_Paulo \n", + "order_created_at 2019-01-24 23:04:27 \n", + "order_scheduled False \n", + "order_scheduled_date NaT \n", + "order_total_amount 27 \n", + "origin_platform ANDROID \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-04-05T13:20:39.644Z \n", + "consumer_active true \n", + "consumer_customer_name EDUARDO \n", + "consumer_customer_phone_area 83 \n", + "consumer_customer_phone_number 020082840 \n", + "restaurant_created_at 2017-01-20T13:14:41.451Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 1 \n", + "restaurant_average_ticket 30.0 \n", + "restaurant_takeout_time 0 \n", + "restaurant_delivery_time 40 \n", + "restaurant_minimum_order_value 30.0 \n", + "restaurant_merchant_zip_code 56560 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "status_created_at 2019-01-25 01:05:07 \n", + "status_id b4298862-fa38-499a-93e2-a76930fb2bce \n", + "status_value CONCLUDED \n", + "\n", + " 1 \\\n", + "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", + "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", + "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", + "cpf 05264105611 \n", + "customer_name BRUNO \n", + "delivery_address_city SAO PAULO \n", + "delivery_address_country BR \n", + "delivery_address_district PENHA DE FRANCA \n", + "delivery_address_external_id 7632090 \n", + "delivery_address_latitude -46.54 \n", + "delivery_address_longitude -23.52 \n", + "delivery_address_state SP \n", + "delivery_address_zip_code 36100 \n", + "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n", + "merchant_latitude -46.54 \n", + "merchant_longitude -23.52 \n", + "merchant_timezone America/Sao_Paulo \n", + "order_created_at 2019-01-03 18:12:24 \n", + "order_scheduled False \n", + "order_scheduled_date NaT \n", + "order_total_amount 17.5 \n", + "origin_platform ANDROID \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-06T14:31:43.348Z \n", + "consumer_active true \n", + "consumer_customer_name BRUNO \n", + "consumer_customer_phone_area 60 \n", + "consumer_customer_phone_number 109441873 \n", + "restaurant_created_at 2017-01-20T13:14:16.179Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 1 \n", + "restaurant_average_ticket 30.0 \n", + "restaurant_takeout_time 20 \n", + "restaurant_delivery_time 0 \n", + "restaurant_minimum_order_value 0.0 \n", + "restaurant_merchant_zip_code 36350 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "status_created_at 2019-01-03 20:15:06 \n", + "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", + "status_value CONCLUDED \n", + "\n", + " 2 \n", + "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", + "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", + "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", + "cpf 78163676650 \n", + "customer_name BRUNO \n", + "delivery_address_city SAO PAULO \n", + "delivery_address_country BR \n", + "delivery_address_district PENHA DE FRANCA \n", + "delivery_address_external_id 7632090 \n", + "delivery_address_latitude -46.54 \n", + "delivery_address_longitude -23.52 \n", + "delivery_address_state SP \n", + "delivery_address_zip_code 36100 \n", + "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n", + "merchant_latitude -46.54 \n", + "merchant_longitude -23.52 \n", + "merchant_timezone America/Sao_Paulo \n", + "order_created_at 2018-12-04 18:12:24 \n", + "order_scheduled False \n", + "order_scheduled_date NaT \n", + "order_total_amount 17.5 \n", + "origin_platform ANDROID \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-06T14:31:43.348Z \n", + "consumer_active true \n", + "consumer_customer_name BRUNO \n", + "consumer_customer_phone_area 60 \n", + "consumer_customer_phone_number 109441873 \n", + "restaurant_created_at 2017-01-20T13:14:16.179Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 1 \n", + "restaurant_average_ticket 30.0 \n", + "restaurant_takeout_time 20 \n", + "restaurant_delivery_time 0 \n", + "restaurant_minimum_order_value 0.0 \n", + "restaurant_merchant_zip_code 36350 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "status_created_at 2019-01-03 20:15:06 \n", + "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", + "status_value CONCLUDED " ] }, - "execution_count": 78, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "extract_latest_value(df=s_df, id_col='order_id', dt_col='created_at').groupBy('value').count().toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [], - "source": [ - "tmp = extract_latest_value(df=s_df, id_col='order_id', dt_col='created_at')" + "tmp = (o_df\n", + " .join(c_df, on='customer_id', how='left')\n", + " .join(r_df, on='merchant_id', how='left')\n", + " .join(s_df, on='order_id', how='left')\n", + " )\n", + "tmp.limit(3).toPandas().T" ] }, { @@ -1717,58 +1312,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "win" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------+\n", - "|sum(count)|\n", - "+----------+\n", - "| 2441067|\n", - "+----------+\n", - "\n" - ] - } - ], - "source": [ - "\n", - "\n", - "import pyspark.sql.functions as f\n", - "tmp.groupBy('order_id')\\\n", - " .count()\\\n", - " .where(f.col('count') == 1)\\\n", - " .select(f.sum('count'))\\\n", - " .show()" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2441067" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tmp.count()" - ] + "source": [] }, { "cell_type": "code", @@ -1786,90 +1330,13 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "sensitive_data_columns = [\n", - " 'cpf',\n", - " 'customer_name',\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "for dtype, cols in dtypes.items():\n", - " for col in cols:\n", - " df = df.withColumn(col, df[col].cast(dtype))" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "root\n", - " |-- cpf: long (nullable = true)\n", - " |-- customer_id: string (nullable = true)\n", - " |-- customer_name: string (nullable = true)\n", - " |-- delivery_address_city: string (nullable = true)\n", - " |-- delivery_address_country: string (nullable = true)\n", - " |-- delivery_address_district: string (nullable = true)\n", - " |-- delivery_address_external_id: string (nullable = true)\n", - " |-- delivery_address_latitude: float (nullable = true)\n", - " |-- delivery_address_longitude: float (nullable = true)\n", - " |-- delivery_address_state: string (nullable = true)\n", - " |-- delivery_address_zip_code: long (nullable = true)\n", - " |-- items: string (nullable = true)\n", - " |-- merchant_id: string (nullable = true)\n", - " |-- merchant_latitude: float (nullable = true)\n", - " |-- merchant_longitude: float (nullable = true)\n", - " |-- merchant_timezone: string (nullable = true)\n", - " |-- order_created_at: timestamp (nullable = true)\n", - " |-- order_id: string (nullable = true)\n", - " |-- order_scheduled: boolean (nullable = true)\n", - " |-- order_scheduled_date: timestamp (nullable = true)\n", - " |-- order_total_amount: float (nullable = true)\n", - " |-- origin_platform: string (nullable = true)\n", - "\n" - ] - } - ], - "source": [ - "df.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "ename": "TypeError", - "evalue": "from_json() missing 2 required positional arguments: 'col' and 'schema'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m\u001b[0m", - "\u001b[0;31mTypeError\u001b[0mTraceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfrom_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m: from_json() missing 2 required positional arguments: 'col' and 'schema'" - ] - } - ], - "source": [ - "from_json()" + "# for dtype, cols in dtypes.items():\n", + "# for col in cols:\n", + "# df = df.withColumn(col, df[col].cast(dtype))" ] } ], diff --git a/dev/docker-volume/src/DataProcessor.py b/dev/docker-volume/src/DataProcessor.py index ae316a6..48b9f38 100644 --- a/dev/docker-volume/src/DataProcessor.py +++ b/dev/docker-volume/src/DataProcessor.py @@ -1,6 +1,7 @@ -from src.config import TRUSTED_DATA_PATH +from src.config import TRUSTED_DATA_PATH, RAW_DATA_PATH from pyspark.sql import Window from pyspark.sql.dataframe import DataFrame +from pyspark.sql.session import SparkSession from pyspark.sql.types import ArrayType, StructField, StructType, StringType, FloatType from pyspark.sql.functions import from_json, explode, flatten, col, rank, col, monotonically_increasing_id, desc @@ -36,12 +37,14 @@ def fix_order_schema(df): return df -def create_order_items(df:DataFrame): +def create_trusted_order_items(spark:SparkSession): """ Creates requested Order Items table based on raw Orders `df`. """ print('Starting processing to generate Order Items dataset...') + + df = spark.read.parquet(str(RAW_DATA_PATH / 'order')) schema = ArrayType(StructType([ StructField("name", StringType(), True), @@ -62,6 +65,7 @@ def create_order_items(df:DataFrame): df = df.withColumn("items", from_json(df["items"], schema)) tmp = df.select('order_id', explode(df['items']).alias('items')) tmp = tmp.select('order_id', 'items.*') + tmp = tmp.dropDuplicates() output_path = TRUSTED_DATA_PATH / 'order_items' print(f'Exporting dataset file system...') @@ -84,4 +88,57 @@ def extract_latest_values(df:DataFrame, id_col:str, dt_col:str): .withColumn('rank', rank().over(window)) \ .filter(col('rank') == 1).drop('rank','tiebreak') - return df \ No newline at end of file + return df + +def load_sanitized_dataframe(table:str, spark:SparkSession): + df = spark.read.parquet(str(RAW_DATA_PATH / table)) + df = add_prefix(df, table) + + if table == 'restaurant': + df = df.withColumnRenamed('id', 'merchant_id') + + if table == 'status': + df = extract_latest_values(df=df, id_col='order_id', dt_col='status_created_at') + + return df + +def create_trusted_order(spark:SparkSession): + + print('Starting processing to generate Order Items dataset...') + + o_df = load_sanitized_dataframe('order', spark) + c_df = load_sanitized_dataframe('consumer', spark) + r_df = load_sanitized_dataframe('restaurant', spark) + s_df = load_sanitized_dataframe('status', spark) + + tmp = (o_df + .join(c_df, on='customer_id', how='left') + .join(r_df, on='merchant_id', how='left') + .join(s_df, on='order_id', how='left') + .dropDuplicates() + ) + + print(f'Exporting dataset file system...') + + output_path = TRUSTED_DATA_PATH / 'order' + + # anonymize sensitive data by dropping columns + sensitive_data_columns = ['cpf', 'customer_name', 'consumer_customer_name', 'consumer_customer_phone_number'] + tmp = tmp.drop(*sensitive_data_columns) + + tmp.write.parquet(str(output_path)) + + print(f'Dataset sucessfully exported to `{output_path}`!') + + return tmp + +def add_prefix(df:DataFrame, prefix:str, skip_ids:bool=True): + + for col in df.columns: + + if col.endswith('id'): + continue + + df = df.withColumnRenamed(col, f'{prefix}_{col}') + + return df diff --git a/dev/docker-volume/src/IOController.py b/dev/docker-volume/src/IOController.py index f958f4d..ee958c9 100644 --- a/dev/docker-volume/src/IOController.py +++ b/dev/docker-volume/src/IOController.py @@ -18,8 +18,8 @@ def create_pyspark_session(app_name='my_app', aws_secrets:dict=aws_secrets) -> S spark = pyspark.sql.SparkSession.builder \ .appName(app_name) \ .config("spark.memory.fraction", 0.8) \ - .config("spark.executor.memory", "12g") \ - .config("spark.driver.memory", "15g") \ + .config("spark.executor.memory", "8g") \ + .config("spark.driver.memory", "8g") \ .config("spark.sql.shuffle.partitions" , "800") \ .config('spark.sql.codegen.wholeStage', False) \ .getOrCreate() diff --git a/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc b/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc index a8c542d4898f74ba89e03d81ae42281145a38a20..fbcb3a0556ca9cc3f073cfba2f6ab1364f082758 100644 GIT binary patch literal 4302 zcmZ`+TW=f372ch_aCs3$>K0jcoOM&AW>Cm&P^2&EMTz1jMXR`$(iCh`YQFzyeu+!|8-iQhPOZ%D+8OIIo7yaOGPmm*-ATR7@A^h}(@GX} z1EYIsHLG=NS-o4&8r?=V*PY9n-KH7y)A_8`ZDs9lJ6q^3WQ*O!Y^l3s?#tcfY^A$m zTd~h8Pi-FX>gR6vmDrD0dF?6Vb>2Yx2A|{2&#mrS9PoL}YVkIDZ(`Q1_!eJy>hMLr zgq~N?4)`+Gt?*aSzCD@0%5R|c8h_2K2>DH{xW!-n-0R-qx3{fM_!TU-I!w2B9)7&N z^ZsUUb7N)T3<`s#iXr(D;{h8Z9|$@h(3y`L75BDE9f>Od4(?@$)GuEbAyx0i73@>uqt*_!b|-zF<3 zKho*o-@_$C2!(?mTNn1FrP$aybk6MoJ4XPZ&%LzZBzC%93-NpJAnqT@P##3XIE->0 z%Kkx|MPV@rr~WV#PZ#gLE{bFM!|-+Nc$DR`6aEaI@rE#3X|G?5aC1_aRGCcBbu+eY zo+uWvt!I{4!|nNN*#9tABaw$Myb|v6!ESh*sDrRy2od*HJoVmQ zB+)4Hu-uq=K*!NuqT)>I8ZCOyI8nR(NUDx!>=3uHP3@UA7${O{Lc+ML6gO!?ZIC?a zmB-XVTauNQ+p+_Djfw9=zZM$glWAzYfd;oZ;|_PZx9@S}=|1DZQ+8%61jpDuhwaAh z>XyF2<1~33i<4f&xxmTv5VMIIA$-@qJyhiT*(Nl$r z(q5F6;nN=uzT>s0$*|W))*z?70VS69_mX{tUB@;Ci&tUS%Wi6so62{T&Z5YZs}PpU zEN0`f*w>8oZ`gnQpvJ^H#*qA;q~yQmMe-ofwv48CkZzP<=QbtOp?$%Wqg>8zTIcqq z4NrQ?R~7E4;N0TwxdYvs(ft{{qN>w5!Hiy=nNyq5>oa;|MxUF}n=|_SjNZD^_vg;B z_cMl!sPVR{k8R$*VT}P0h z;g4nG59hE={usOD3!D73GIqbT_$z+{2*FGFI#B>)JDs zq1_z7vZDh8c%lIHI^c;cp)7bbLJTMBLEK2()#)X3fUXv$qX9h!`-yvj$(V-j>xDG~Ol@Qp9Rq!`F zwfmgy-svoeWqO~B6XCjo_=m-SbE$-IFd!-<2!RE`ji4Gt)JcRS8W7r-aS9X^bJXyn zG>N2ca#0L7N5d3!hLaYAM%p_TFb1G^kJgq5UZ$vaR5pZ#BQ+eU-Y`-JFK6`{(V>ca z;9v@vNrD^S+{M%6ParHWU`x!k8~^YBub{PJuVFN>0~W9r{$dO3o;j1XF8%uyVh+3Fw;Q&0`k zEPPCX`XaGZlA$-shSP_XX78E@=s+d$9-Mw8h;6>2QlnEB7RO}tOyJ_*{ht^u!lLlJ}Gbo)g$@h~Ph>YaJ%3m=HntxGI!PuYDPiyRhe%cfI@o+E;TjEF9t@ts74*EqdM>rSp zDfRWJuYQH$rshuLfkF^5u!D0@9E(5EJUTS7LE=qXF*jX7C!klZ9@@L!fBW10!-FD^ zd-*8a!xrA92NJS$ss@jZ&$WA4B)RyI#t1*fA4vR~#562AfjMa8S&t7;I~pmBl#qgg zTl^M6ugEx;IP%A3Wli`-#r(_BvGiRyo8STREeH#!S+mU_IBP1YZ+uERq@C}W`+sW{ zV5`$W1Q~o4Shdr+0@z9H2J#4{UBczJAeQ)s9ARFEB8v+5H%!?<0BehtM?N4H5i_=N zy10L7{f!dRqJEsQvn42wwcW!zsp!)|oO4OJBKFX{0v-unC=&A|s75RWmD(bL_zNy+ zzPVWqMl-g`O#fEL-syZ#*YNeSj$<7p`{n7F1g#HGOcA**AEhN$F3?DfuP9REf*E%i$C=NQ;GL}Cqcr|GF@>c5)`W*|*>|00 RrBw@>%^xf@TaD)J{{gw2aB2Vm delta 1403 zcmZ8gOK;mo5Z>iOe2EhEup-&%Lr#HKXyMeSwm{IvsS`9PPzQ~DAqpsjcIB|;TFP=2 z$AJtTlz+fuT);l|+Wvzc+FO4Bft-2@dg&i1&>6}xh>+m$o5wC@=9~Ge|6gZu;QKCt z=d+s!!&8rtzfmba1}L}TOMeBy2%`xRG^7-Q-7qze^>fxd;f-n$`uu=S*VNjs5Qr9`4`W9q#Vz z?+hR8?0++$*&gj=GkQq<>=k`wHz4vCBD>k|+7}%gloZP+MwiNUbZN4gHS|C4H$jot zLF6V=CsUpqqe+rVAI;q>(Zc|D96uYTqsLrCbAEI2v%c}~$D3f08z|m7 z8{pZAWL#d}(93R_{lzB`I-(%nJeVY=^fi3x``}T)(-JsRRB1|QnlhGz=}R53QNm^k zSCn}bW0kO7ig60OSY70n99{`mOV}^rS_#+R;6r~2iJsCqIj^ulS#!dI9+}gncA=dT zu-dGloRxJ+&s`Q=(0kz0<{G+K6S4AGYlEEXP<0!to@+l=&wW*!>q~lpT_9|2uD>L# z^D|h`4`XpH`=fH7KF7aNQ1#VDYNqNFYO2QJsSVT!2rBW;Kv5XbZt!X5m%(%++RLOxMkuW$^Oh&FNG0nZn zOigEMIE~a}*}Vz=)j$+;)Er43DSkN)aG^*)goyN05X7`KBinY`*;h^wXtox>D|f*$ z&}H+t$l0Ka8fdR^Xa=6t;g@@mkBU`1Il6O_Tg8;!IgtUlRz=~oN<5Vtlc!wfm2sRX zE{h@04JIek%Od1mjJ$`DmE-t`O^)B5<2HZRQZgE;VFD$jYWOrtX8hZ%?!C9zgLB^v zHgY$Wqpi{8XdEA|+U8A9Qb_Tqt@KH<%6P40*U2#^>RS|0{OW(ofi Wo1G%dRf_L*`jK9<0@tq9KKLIUlRdlu diff --git a/dev/docker-volume/src/__pycache__/IOController.cpython-38.pyc b/dev/docker-volume/src/__pycache__/IOController.cpython-38.pyc index 90d91c76983b0775568e3827d2019cf4b7163a55..5ec9067a6fbf770b3a8b6abaf651292c195dcab6 100644 GIT binary patch delta 154 zcmew__*0NKl$V!_0SHp6LgNcJ^4c>BvMyv=z?H(D!U3i^Q@AF_GRj(Vr|>LfVq~ac zUBI8hTf?|OAcb!sBV2?ZEFu6ADPc$vY-R+iWe#T06q>x5ah(K{MS7KRaY15Hwq8n6 lW?5>HUT$h`eo^J-dZu<(##@uwIl>qXC&zK50m-8rRsi97DZ&5% delta 160 zcmew<_+OAWl$V!_0SNd6LgI@z^4c>BvM*#>z?H(0!U?9iQn)9_GRj)=r0_0eVq~ac zUBI8hSHrkKAccP+BV0rPEFuUIDPc$vYGwqgWe#T06rQ}9ah)8qp;3C3aB)FmQMO)6 rQD#|ckzQ_UZhlc^6tkge`sPZec2>ry$&4Ifj0Te