diff --git a/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb b/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb index 3094430..ba8d5dd 100644 --- a/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb +++ b/dev/docker-volume/notebooks/03-create-trusted-data-order-items.ipynb @@ -67,55 +67,6 @@ "spark = create_pyspark_session()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Loads raw `Order` data." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(#rows, #columns) = (3683040, 22)\n", - "root\n", - " |-- cpf: string (nullable = true)\n", - " |-- customer_id: string (nullable = true)\n", - " |-- customer_name: string (nullable = true)\n", - " |-- delivery_address_city: string (nullable = true)\n", - " |-- delivery_address_country: string (nullable = true)\n", - " |-- delivery_address_district: string (nullable = true)\n", - " |-- delivery_address_external_id: string (nullable = true)\n", - " |-- delivery_address_latitude: string (nullable = true)\n", - " |-- delivery_address_longitude: string (nullable = true)\n", - " |-- delivery_address_state: string (nullable = true)\n", - " |-- delivery_address_zip_code: string (nullable = true)\n", - " |-- items: string (nullable = true)\n", - " |-- merchant_id: string (nullable = true)\n", - " |-- merchant_latitude: string (nullable = true)\n", - " |-- merchant_longitude: string (nullable = true)\n", - " |-- merchant_timezone: string (nullable = true)\n", - " |-- order_created_at: timestamp (nullable = true)\n", - " |-- order_id: string (nullable = true)\n", - " |-- order_scheduled: boolean (nullable = true)\n", - " |-- order_scheduled_date: timestamp (nullable = true)\n", - " |-- order_total_amount: double (nullable = true)\n", - " |-- origin_platform: string (nullable = true)\n", - "\n" - ] - } - ], - "source": [ - "df = 
spark.read.parquet(str(RAW_DATA_PATH / 'order'))\n", - "explore_dataframe(df)" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb b/dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb deleted file mode 100644 index 1bf1515..0000000 --- a/dev/docker-volume/notebooks/04-create-trusted-data-order-Copy1.ipynb +++ /dev/null @@ -1,1421 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "from src.config import RAW_DATA_PATH\n", - "from src.DataProcessor import explore_dataframe, extract_latest_values, add_prefix, create_trusted_order\n", - "from src.IOController import create_pyspark_session" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting PySpark session. 
Check your terminal for detailed logging...\n", - "PySpark session sucessfully created.\n" - ] - } - ], - "source": [ - "spark = create_pyspark_session()" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting processing to generate Order Items dataset...\n", - "Exporting dataset file system...\n", - "Dataset sucessfully exported to `/home/jovyan/data/trusted/order`!\n" - ] - } - ], - "source": [ - "tmp = create_trusted_order(spark)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "pyspark.sql.session.SparkSession" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(spark)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tmp.limit(3).toPandas().T" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(#rows, #columns) = (3683040, 22)\n", - "root\n", - " |-- cpf: string (nullable = true)\n", - " |-- customer_id: string (nullable = true)\n", - " |-- customer_name: string (nullable = true)\n", - " |-- delivery_address_city: string (nullable = true)\n", - " |-- delivery_address_country: string (nullable = true)\n", - " |-- delivery_address_district: string (nullable = true)\n", - " |-- delivery_address_external_id: string (nullable = true)\n", - " |-- delivery_address_latitude: string (nullable = true)\n", - " |-- delivery_address_longitude: string (nullable = true)\n", - " |-- delivery_address_state: string (nullable = true)\n", - " |-- delivery_address_zip_code: string (nullable = true)\n", - " |-- items: string (nullable = true)\n", - " |-- merchant_id: string (nullable = true)\n", - " |-- merchant_latitude: string (nullable = true)\n", - " 
|-- merchant_longitude: string (nullable = true)\n", - " |-- merchant_timezone: string (nullable = true)\n", - " |-- order_created_at: timestamp (nullable = true)\n", - " |-- order_id: string (nullable = true)\n", - " |-- order_scheduled: boolean (nullable = true)\n", - " |-- order_scheduled_date: timestamp (nullable = true)\n", - " |-- order_total_amount: double (nullable = true)\n", - " |-- origin_platform: string (nullable = true)\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
012
cpf805321017634335210396138650991217
customer_id977b9a89-825f-464b-8ef6-0f453d7334c1e969cc0d-388b-4025-9351-0db0f718d81ce08dcc8b-f998-405e-b3f2-7107ea8958cf
customer_nameGUSTAVOMICHELLEVICTOR
delivery_address_cityFRANCASANTOSGUARULHOS
delivery_address_countryBRBRBR
delivery_address_districtJARDIM ESPRAIADOCAMPO GRANDEJARDIM ROSSI
delivery_address_external_id673665587592168765930
delivery_address_latitude-47.39-46.34-46.53
delivery_address_longitude-20.55-23.96-23.44
delivery_address_stateSPSPSP
delivery_address_zip_code144031107071304
items[{\"name\": \"Parmegiana de Filé de Frango (2 pes...[{\"name\": \"Filé Mignon à Cubana\", \"addition\": ...[{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va...
merchant_ideb4197f9-964c-4f87-8307-709e498aab87927d46f9-4bb3-48f7-be1d-584deaf18adc71ad62c5-5947-4518-9846-976fbdd2f881
merchant_latitude-47.39-46.34-46.53
merchant_longitude-20.55-23.96-23.44
merchant_timezoneAmerica/Sao_PauloAmerica/Sao_PauloAmerica/Sao_Paulo
order_created_at2019-01-17 22:50:062019-01-17 17:51:262019-01-17 22:53:47
order_iddd4f8f0a-c2cb-45c6-a002-c3be6b305e5f8dd80f0b-db00-4b88-b7e2-02ca706fc5a5430f9887-a563-45ee-8001-1cb29597d9dd
order_scheduledFalseFalseFalse
order_scheduled_dateNaTNaTNaT
order_total_amount46104.535
origin_platformANDROIDANDROIDIOS
\n", - "
" - ], - "text/plain": [ - " 0 \\\n", - "cpf 80532101763 \n", - "customer_id 977b9a89-825f-464b-8ef6-0f453d7334c1 \n", - "customer_name GUSTAVO \n", - "delivery_address_city FRANCA \n", - "delivery_address_country BR \n", - "delivery_address_district JARDIM ESPRAIADO \n", - "delivery_address_external_id 6736655 \n", - "delivery_address_latitude -47.39 \n", - "delivery_address_longitude -20.55 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 14403 \n", - "items [{\"name\": \"Parmegiana de Filé de Frango (2 pes... \n", - "merchant_id eb4197f9-964c-4f87-8307-709e498aab87 \n", - "merchant_latitude -47.39 \n", - "merchant_longitude -20.55 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-17 22:50:06 \n", - "order_id dd4f8f0a-c2cb-45c6-a002-c3be6b305e5f \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 46 \n", - "origin_platform ANDROID \n", - "\n", - " 1 \\\n", - "cpf 43352103961 \n", - "customer_id e969cc0d-388b-4025-9351-0db0f718d81c \n", - "customer_name MICHELLE \n", - "delivery_address_city SANTOS \n", - "delivery_address_country BR \n", - "delivery_address_district CAMPO GRANDE \n", - "delivery_address_external_id 8759216 \n", - "delivery_address_latitude -46.34 \n", - "delivery_address_longitude -23.96 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 11070 \n", - "items [{\"name\": \"Filé Mignon à Cubana\", \"addition\": ... 
\n", - "merchant_id 927d46f9-4bb3-48f7-be1d-584deaf18adc \n", - "merchant_latitude -46.34 \n", - "merchant_longitude -23.96 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-17 17:51:26 \n", - "order_id 8dd80f0b-db00-4b88-b7e2-02ca706fc5a5 \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 104.5 \n", - "origin_platform ANDROID \n", - "\n", - " 2 \n", - "cpf 38650991217 \n", - "customer_id e08dcc8b-f998-405e-b3f2-7107ea8958cf \n", - "customer_name VICTOR \n", - "delivery_address_city GUARULHOS \n", - "delivery_address_country BR \n", - "delivery_address_district JARDIM ROSSI \n", - "delivery_address_external_id 8765930 \n", - "delivery_address_latitude -46.53 \n", - "delivery_address_longitude -23.44 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 71304 \n", - "items [{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va... \n", - "merchant_id 71ad62c5-5947-4518-9846-976fbdd2f881 \n", - "merchant_latitude -46.53 \n", - "merchant_longitude -23.44 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-17 22:53:47 \n", - "order_id 430f9887-a563-45ee-8001-1cb29597d9dd \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 35 \n", - "origin_platform IOS " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "o_df = spark.read.parquet(str(RAW_DATA_PATH / 'order'))\n", - "explore_dataframe(o_df)\n", - "o_df.limit(3).toPandas().T" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(#rows, #columns) = (809323, 7)\n", - "root\n", - " |-- customer_id: string (nullable = true)\n", - " |-- language: string (nullable = true)\n", - " |-- created_at: string (nullable = true)\n", - " |-- active: string (nullable = true)\n", - " |-- customer_name: string (nullable = true)\n", - " 
|-- customer_phone_area: string (nullable = true)\n", - " |-- customer_phone_number: string (nullable = true)\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
012
customer_id00039466-560f-4e57-85a2-d4753cd901be001a1267-31a3-4f5b-a028-d7e323864b08003ae1d5-67b8-4a04-b055-0e4e9622771a
consumer_languagept-brpt-brpt-br
consumer_created_at2018-04-05T14:49:18.165Z2018-01-14T21:40:02.141Z2018-01-07T03:47:15.554Z
consumer_activetruetruetrue
consumer_customer_nameNUNOADRIELLYPAULA
consumer_customer_phone_area465962
consumer_customer_phone_number816135924231330577347597883
\n", - "
" - ], - "text/plain": [ - " 0 \\\n", - "customer_id 00039466-560f-4e57-85a2-d4753cd901be \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-04-05T14:49:18.165Z \n", - "consumer_active true \n", - "consumer_customer_name NUNO \n", - "consumer_customer_phone_area 46 \n", - "consumer_customer_phone_number 816135924 \n", - "\n", - " 1 \\\n", - "customer_id 001a1267-31a3-4f5b-a028-d7e323864b08 \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-01-14T21:40:02.141Z \n", - "consumer_active true \n", - "consumer_customer_name ADRIELLY \n", - "consumer_customer_phone_area 59 \n", - "consumer_customer_phone_number 231330577 \n", - "\n", - " 2 \n", - "customer_id 003ae1d5-67b8-4a04-b055-0e4e9622771a \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-01-07T03:47:15.554Z \n", - "consumer_active true \n", - "consumer_customer_name PAULA \n", - "consumer_customer_phone_area 62 \n", - "consumer_customer_phone_number 347597883 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table = 'consumer'\n", - "c_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", - "explore_dataframe(c_df)\n", - "c_df = add_prefix(c_df, table)\n", - "c_df.limit(3).toPandas().T" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(#rows, #columns) = (7292, 12)\n", - "root\n", - " |-- id: string (nullable = true)\n", - " |-- created_at: string (nullable = true)\n", - " |-- enabled: string (nullable = true)\n", - " |-- price_range: string (nullable = true)\n", - " |-- average_ticket: string (nullable = true)\n", - " |-- takeout_time: string (nullable = true)\n", - " |-- delivery_time: string (nullable = true)\n", - " |-- minimum_order_value: string (nullable = true)\n", - " |-- merchant_zip_code: string (nullable = true)\n", - " |-- merchant_city: string (nullable = true)\n", - " |-- 
merchant_state: string (nullable = true)\n", - " |-- merchant_country: string (nullable = true)\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
012
merchant_id02c94103-61f3-4906-a4a9-55611db9f28c15e7f5fd-090d-47b9-9f14-b6f7fce3c95d33ca5d3d-b99f-404d-84d9-8df8f38a2261
restaurant_created_at2017-01-23T12:52:30.910Z2017-01-20T13:14:48.286Z2017-01-23T12:46:33.457Z
restaurant_enabledfalsetruetrue
restaurant_price_range335
restaurant_average_ticket60.060.0100.0
restaurant_takeout_time000
restaurant_delivery_time50045
restaurant_minimum_order_value30.030.010.0
restaurant_merchant_zip_code140255018023090
restaurant_merchant_cityRIBEIRAO PRETOSAO PAULORIO DE JANEIRO
restaurant_merchant_stateSPSPRJ
restaurant_merchant_countryBRBRBR
\n", - "
" - ], - "text/plain": [ - " 0 \\\n", - "merchant_id 02c94103-61f3-4906-a4a9-55611db9f28c \n", - "restaurant_created_at 2017-01-23T12:52:30.910Z \n", - "restaurant_enabled false \n", - "restaurant_price_range 3 \n", - "restaurant_average_ticket 60.0 \n", - "restaurant_takeout_time 0 \n", - "restaurant_delivery_time 50 \n", - "restaurant_minimum_order_value 30.0 \n", - "restaurant_merchant_zip_code 14025 \n", - "restaurant_merchant_city RIBEIRAO PRETO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "\n", - " 1 \\\n", - "merchant_id 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d \n", - "restaurant_created_at 2017-01-20T13:14:48.286Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 3 \n", - "restaurant_average_ticket 60.0 \n", - "restaurant_takeout_time 0 \n", - "restaurant_delivery_time 0 \n", - "restaurant_minimum_order_value 30.0 \n", - "restaurant_merchant_zip_code 50180 \n", - "restaurant_merchant_city SAO PAULO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "\n", - " 2 \n", - "merchant_id 33ca5d3d-b99f-404d-84d9-8df8f38a2261 \n", - "restaurant_created_at 2017-01-23T12:46:33.457Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 5 \n", - "restaurant_average_ticket 100.0 \n", - "restaurant_takeout_time 0 \n", - "restaurant_delivery_time 45 \n", - "restaurant_minimum_order_value 10.0 \n", - "restaurant_merchant_zip_code 23090 \n", - "restaurant_merchant_city RIO DE JANEIRO \n", - "restaurant_merchant_state RJ \n", - "restaurant_merchant_country BR " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table = 'restaurant'\n", - "r_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", - "explore_dataframe(r_df)\n", - "r_df = r_df.withColumnRenamed('id', 'merchant_id')\n", - "r_df = add_prefix(r_df, table)\n", - "r_df.limit(3).toPandas().T" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(#rows, #columns) = (11075048, 4)\n", - "root\n", - " |-- created_at: timestamp (nullable = true)\n", - " |-- order_id: string (nullable = true)\n", - " |-- status_id: string (nullable = true)\n", - " |-- value: string (nullable = true)\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
status_created_atorder_idstatus_idstatus_value
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDED
12019-01-24 23:04:270002fe02-d7dc-4232-b7ac-3394019ce2407964bf63-007a-484d-a321-e9118ccc2f97REGISTERED
22019-01-24 23:04:280002fe02-d7dc-4232-b7ac-3394019ce240ca16b92b-db8f-4274-b165-929675541a9fPLACED
\n", - "
" - ], - "text/plain": [ - " status_created_at order_id \\\n", - "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "1 2019-01-24 23:04:27 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "2 2019-01-24 23:04:28 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "\n", - " status_id status_value \n", - "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", - "1 7964bf63-007a-484d-a321-e9118ccc2f97 REGISTERED \n", - "2 ca16b92b-db8f-4274-b165-929675541a9f PLACED " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table = 'status'\n", - "s_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", - "explore_dataframe(s_df)\n", - "s_df = add_prefix(s_df, table)\n", - "s_df.limit(3).toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(#rows, #columns) = (2441067, 4)\n", - "root\n", - " |-- status_created_at: timestamp (nullable = true)\n", - " |-- order_id: string (nullable = true)\n", - " |-- status_id: string (nullable = true)\n", - " |-- status_value: string (nullable = true)\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
status_created_atorder_idstatus_idstatus_value
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDED
12019-01-03 20:15:060012d95c-9c4b-4244-86b5-dcf87677dcc1c8058ba4-1d1c-4a1f-8401-a7042ef6ba85CONCLUDED
22019-01-06 16:20:270013fc5c-4c10-4402-886c-1b8166e4632ed0a3ffd5-4e48-4cc4-9739-d5764678c19fCONCLUDED
32019-01-03 15:55:1200251da6-aa45-4512-be58-6622a248cdffa621de13-5272-4c7c-969c-3bcd53f0515fCONCLUDED
42019-01-21 22:20:0200273652-efc8-4e7c-95b2-cd3827900e7e74f8ff33-e731-477f-a698-4755577b80a1CONCLUDED
\n", - "
" - ], - "text/plain": [ - " status_created_at order_id \\\n", - "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "1 2019-01-03 20:15:06 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", - "2 2019-01-06 16:20:27 0013fc5c-4c10-4402-886c-1b8166e4632e \n", - "3 2019-01-03 15:55:12 00251da6-aa45-4512-be58-6622a248cdff \n", - "4 2019-01-21 22:20:02 00273652-efc8-4e7c-95b2-cd3827900e7e \n", - "\n", - " status_id status_value \n", - "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", - "1 c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 CONCLUDED \n", - "2 d0a3ffd5-4e48-4cc4-9739-d5764678c19f CONCLUDED \n", - "3 a621de13-5272-4c7c-969c-3bcd53f0515f CONCLUDED \n", - "4 74f8ff33-e731-477f-a698-4755577b80a1 CONCLUDED " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s_df = extract_latest_values(df=s_df, id_col='order_id', dt_col='status_created_at')\n", - "explore_dataframe(s_df)\n", - "s_df.limit(5).toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+-------+\n", - "|status_value| count|\n", - "+------------+-------+\n", - "| CONCLUDED|2354218|\n", - "| CANCELLED| 55179|\n", - "| PLACED| 31654|\n", - "| REGISTERED| 16|\n", - "+------------+-------+\n", - "\n" - ] - } - ], - "source": [ - "s_df.groupBy('status_value').count().sort('count', ascending=False).show()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
012
order_id0002fe02-d7dc-4232-b7ac-3394019ce2400012d95c-9c4b-4244-86b5-dcf87677dcc10012d95c-9c4b-4244-86b5-dcf87677dcc1
merchant_ide3d24e4d-2f51-4987-8c39-47923b20b9bee66b0dcc-ffa9-42ee-a864-60977672c3efe66b0dcc-ffa9-42ee-a864-60977672c3ef
customer_id97c53c25-bd9b-41cb-8a0d-13cd74509f17e2213649-cede-4770-b6e0-7ac1dd4d3548e2213649-cede-4770-b6e0-7ac1dd4d3548
cpf168541854920526410561178163676650
customer_nameEDUARDOBRUNOBRUNO
delivery_address_citySAO PAULOSAO PAULOSAO PAULO
delivery_address_countryBRBRBR
delivery_address_districtITAIM BIBIPENHA DE FRANCAPENHA DE FRANCA
delivery_address_external_id884712276320907632090
delivery_address_latitude-46.68-46.54-46.54
delivery_address_longitude-23.59-23.52-23.52
delivery_address_stateSPSPSP
delivery_address_zip_code453813610036100
items[{\"name\": \"Pastel Frangolino\", \"addition\": {\"v...[{\"name\": \"Porção Batata Frita\", \"addition\": {...[{\"name\": \"Porção Batata Frita\", \"addition\": {...
merchant_latitude-46.68-46.54-46.54
merchant_longitude-23.59-23.52-23.52
merchant_timezoneAmerica/Sao_PauloAmerica/Sao_PauloAmerica/Sao_Paulo
order_created_at2019-01-24 23:04:272019-01-03 18:12:242018-12-04 18:12:24
order_scheduledFalseFalseFalse
order_scheduled_dateNaTNaTNaT
order_total_amount2717.517.5
origin_platformANDROIDANDROIDANDROID
consumer_languagept-brpt-brpt-br
consumer_created_at2018-04-05T13:20:39.644Z2018-01-06T14:31:43.348Z2018-01-06T14:31:43.348Z
consumer_activetruetruetrue
consumer_customer_nameEDUARDOBRUNOBRUNO
consumer_customer_phone_area836060
consumer_customer_phone_number020082840109441873109441873
restaurant_created_at2017-01-20T13:14:41.451Z2017-01-20T13:14:16.179Z2017-01-20T13:14:16.179Z
restaurant_enabledtruetruetrue
restaurant_price_range111
restaurant_average_ticket30.030.030.0
restaurant_takeout_time02020
restaurant_delivery_time4000
restaurant_minimum_order_value30.00.00.0
restaurant_merchant_zip_code565603635036350
restaurant_merchant_citySAO PAULOSAO PAULOSAO PAULO
restaurant_merchant_stateSPSPSP
restaurant_merchant_countryBRBRBR
status_created_at2019-01-25 01:05:072019-01-03 20:15:062019-01-03 20:15:06
status_idb4298862-fa38-499a-93e2-a76930fb2bcec8058ba4-1d1c-4a1f-8401-a7042ef6ba85c8058ba4-1d1c-4a1f-8401-a7042ef6ba85
status_valueCONCLUDEDCONCLUDEDCONCLUDED
\n", - "
" - ], - "text/plain": [ - " 0 \\\n", - "order_id 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "merchant_id e3d24e4d-2f51-4987-8c39-47923b20b9be \n", - "customer_id 97c53c25-bd9b-41cb-8a0d-13cd74509f17 \n", - "cpf 16854185492 \n", - "customer_name EDUARDO \n", - "delivery_address_city SAO PAULO \n", - "delivery_address_country BR \n", - "delivery_address_district ITAIM BIBI \n", - "delivery_address_external_id 8847122 \n", - "delivery_address_latitude -46.68 \n", - "delivery_address_longitude -23.59 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 45381 \n", - "items [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... \n", - "merchant_latitude -46.68 \n", - "merchant_longitude -23.59 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-24 23:04:27 \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 27 \n", - "origin_platform ANDROID \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-04-05T13:20:39.644Z \n", - "consumer_active true \n", - "consumer_customer_name EDUARDO \n", - "consumer_customer_phone_area 83 \n", - "consumer_customer_phone_number 020082840 \n", - "restaurant_created_at 2017-01-20T13:14:41.451Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 1 \n", - "restaurant_average_ticket 30.0 \n", - "restaurant_takeout_time 0 \n", - "restaurant_delivery_time 40 \n", - "restaurant_minimum_order_value 30.0 \n", - "restaurant_merchant_zip_code 56560 \n", - "restaurant_merchant_city SAO PAULO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "status_created_at 2019-01-25 01:05:07 \n", - "status_id b4298862-fa38-499a-93e2-a76930fb2bce \n", - "status_value CONCLUDED \n", - "\n", - " 1 \\\n", - "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", - "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", - "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", - "cpf 05264105611 \n", - "customer_name BRUNO \n", - 
"delivery_address_city SAO PAULO \n", - "delivery_address_country BR \n", - "delivery_address_district PENHA DE FRANCA \n", - "delivery_address_external_id 7632090 \n", - "delivery_address_latitude -46.54 \n", - "delivery_address_longitude -23.52 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 36100 \n", - "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n", - "merchant_latitude -46.54 \n", - "merchant_longitude -23.52 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-03 18:12:24 \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 17.5 \n", - "origin_platform ANDROID \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-01-06T14:31:43.348Z \n", - "consumer_active true \n", - "consumer_customer_name BRUNO \n", - "consumer_customer_phone_area 60 \n", - "consumer_customer_phone_number 109441873 \n", - "restaurant_created_at 2017-01-20T13:14:16.179Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 1 \n", - "restaurant_average_ticket 30.0 \n", - "restaurant_takeout_time 20 \n", - "restaurant_delivery_time 0 \n", - "restaurant_minimum_order_value 0.0 \n", - "restaurant_merchant_zip_code 36350 \n", - "restaurant_merchant_city SAO PAULO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "status_created_at 2019-01-03 20:15:06 \n", - "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", - "status_value CONCLUDED \n", - "\n", - " 2 \n", - "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", - "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", - "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", - "cpf 78163676650 \n", - "customer_name BRUNO \n", - "delivery_address_city SAO PAULO \n", - "delivery_address_country BR \n", - "delivery_address_district PENHA DE FRANCA \n", - "delivery_address_external_id 7632090 \n", - "delivery_address_latitude -46.54 \n", - "delivery_address_longitude -23.52 \n", - 
"delivery_address_state SP \n", - "delivery_address_zip_code 36100 \n", - "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... \n", - "merchant_latitude -46.54 \n", - "merchant_longitude -23.52 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2018-12-04 18:12:24 \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 17.5 \n", - "origin_platform ANDROID \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-01-06T14:31:43.348Z \n", - "consumer_active true \n", - "consumer_customer_name BRUNO \n", - "consumer_customer_phone_area 60 \n", - "consumer_customer_phone_number 109441873 \n", - "restaurant_created_at 2017-01-20T13:14:16.179Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 1 \n", - "restaurant_average_ticket 30.0 \n", - "restaurant_takeout_time 20 \n", - "restaurant_delivery_time 0 \n", - "restaurant_minimum_order_value 0.0 \n", - "restaurant_merchant_zip_code 36350 \n", - "restaurant_merchant_city SAO PAULO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "status_created_at 2019-01-03 20:15:06 \n", - "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", - "status_value CONCLUDED " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tmp = (o_df\n", - " .join(c_df, on='customer_id', how='left')\n", - " .join(r_df, on='merchant_id', how='left')\n", - " .join(s_df, on='order_id', how='left')\n", - " )\n", - "tmp.limit(3).toPandas().T" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Fix schema and anonymize sensitive data" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# for 
dtype, cols in dtypes.items():\n", - "# for col in cols:\n", - "# df = df.withColumn(col, df[col].cast(dtype))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb b/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb index 31255a6..0d3bbc4 100644 --- a/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb +++ b/dev/docker-volume/notebooks/04-create-trusted-data-order.ipynb @@ -1,879 +1,112 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "from src.config import RAW_DATA_PATH\n", - "from src.DataProcessor import explore_dataframe, extract_latest_values, add_prefix, create_trusted_order\n", - "from src.IOController import create_pyspark_session" + "# Truted Data Generation - Order\n", + "\n", + "## Scope of notebook\n", + "\n", + "> Create `Order` dataset following the requirements below.\n", + "\n", + "* Order dataset - one line per order with all data from order, consumer, restaurant and the LAST status from order statuses dataset. To help analysis, it would be a nice to have: data partitioned on the restaurant LOCAL date." ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting PySpark session. 
Check your terminal for detailed logging...\n", - "PySpark session sucessfully created.\n" - ] - } - ], "source": [ - "spark = create_pyspark_session()" + "Loads libraries" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(#rows, #columns) = (3683040, 22)\n", - "root\n", - " |-- cpf: string (nullable = true)\n", - " |-- customer_id: string (nullable = true)\n", - " |-- customer_name: string (nullable = true)\n", - " |-- delivery_address_city: string (nullable = true)\n", - " |-- delivery_address_country: string (nullable = true)\n", - " |-- delivery_address_district: string (nullable = true)\n", - " |-- delivery_address_external_id: string (nullable = true)\n", - " |-- delivery_address_latitude: string (nullable = true)\n", - " |-- delivery_address_longitude: string (nullable = true)\n", - " |-- delivery_address_state: string (nullable = true)\n", - " |-- delivery_address_zip_code: string (nullable = true)\n", - " |-- items: string (nullable = true)\n", - " |-- merchant_id: string (nullable = true)\n", - " |-- merchant_latitude: string (nullable = true)\n", - " |-- merchant_longitude: string (nullable = true)\n", - " |-- merchant_timezone: string (nullable = true)\n", - " |-- order_created_at: timestamp (nullable = true)\n", - " |-- order_id: string (nullable = true)\n", - " |-- order_scheduled: boolean (nullable = true)\n", - " |-- order_scheduled_date: timestamp (nullable = true)\n", - " |-- order_total_amount: double (nullable = true)\n", - " |-- origin_platform: string (nullable = true)\n", - "\n" + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
012
cpf805321017634335210396138650991217
customer_id977b9a89-825f-464b-8ef6-0f453d7334c1e969cc0d-388b-4025-9351-0db0f718d81ce08dcc8b-f998-405e-b3f2-7107ea8958cf
customer_nameGUSTAVOMICHELLEVICTOR
delivery_address_cityFRANCASANTOSGUARULHOS
delivery_address_countryBRBRBR
delivery_address_districtJARDIM ESPRAIADOCAMPO GRANDEJARDIM ROSSI
delivery_address_external_id673665587592168765930
delivery_address_latitude-47.39-46.34-46.53
delivery_address_longitude-20.55-23.96-23.44
delivery_address_stateSPSPSP
delivery_address_zip_code144031107071304
items[{\"name\": \"Parmegiana de Filé de Frango (2 pes...[{\"name\": \"Filé Mignon à Cubana\", \"addition\": ...[{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va...
merchant_ideb4197f9-964c-4f87-8307-709e498aab87927d46f9-4bb3-48f7-be1d-584deaf18adc71ad62c5-5947-4518-9846-976fbdd2f881
merchant_latitude-47.39-46.34-46.53
merchant_longitude-20.55-23.96-23.44
merchant_timezoneAmerica/Sao_PauloAmerica/Sao_PauloAmerica/Sao_Paulo
order_created_at2019-01-17 22:50:062019-01-17 17:51:262019-01-17 22:53:47
order_iddd4f8f0a-c2cb-45c6-a002-c3be6b305e5f8dd80f0b-db00-4b88-b7e2-02ca706fc5a5430f9887-a563-45ee-8001-1cb29597d9dd
order_scheduledFalseFalseFalse
order_scheduled_dateNaTNaTNaT
order_total_amount46104.535
origin_platformANDROIDANDROIDIOS
\n", - "
" - ], - "text/plain": [ - " 0 \\\n", - "cpf 80532101763 \n", - "customer_id 977b9a89-825f-464b-8ef6-0f453d7334c1 \n", - "customer_name GUSTAVO \n", - "delivery_address_city FRANCA \n", - "delivery_address_country BR \n", - "delivery_address_district JARDIM ESPRAIADO \n", - "delivery_address_external_id 6736655 \n", - "delivery_address_latitude -47.39 \n", - "delivery_address_longitude -20.55 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 14403 \n", - "items [{\"name\": \"Parmegiana de Filé de Frango (2 pes... \n", - "merchant_id eb4197f9-964c-4f87-8307-709e498aab87 \n", - "merchant_latitude -47.39 \n", - "merchant_longitude -20.55 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-17 22:50:06 \n", - "order_id dd4f8f0a-c2cb-45c6-a002-c3be6b305e5f \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 46 \n", - "origin_platform ANDROID \n", - "\n", - " 1 \\\n", - "cpf 43352103961 \n", - "customer_id e969cc0d-388b-4025-9351-0db0f718d81c \n", - "customer_name MICHELLE \n", - "delivery_address_city SANTOS \n", - "delivery_address_country BR \n", - "delivery_address_district CAMPO GRANDE \n", - "delivery_address_external_id 8759216 \n", - "delivery_address_latitude -46.34 \n", - "delivery_address_longitude -23.96 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 11070 \n", - "items [{\"name\": \"Filé Mignon à Cubana\", \"addition\": ... 
\n", - "merchant_id 927d46f9-4bb3-48f7-be1d-584deaf18adc \n", - "merchant_latitude -46.34 \n", - "merchant_longitude -23.96 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-17 17:51:26 \n", - "order_id 8dd80f0b-db00-4b88-b7e2-02ca706fc5a5 \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 104.5 \n", - "origin_platform ANDROID \n", - "\n", - " 2 \n", - "cpf 38650991217 \n", - "customer_id e08dcc8b-f998-405e-b3f2-7107ea8958cf \n", - "customer_name VICTOR \n", - "delivery_address_city GUARULHOS \n", - "delivery_address_country BR \n", - "delivery_address_district JARDIM ROSSI \n", - "delivery_address_external_id 8765930 \n", - "delivery_address_latitude -46.53 \n", - "delivery_address_longitude -23.44 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 71304 \n", - "items [{\"name\": \"GRANDE 2 SABORES\", \"addition\": {\"va... \n", - "merchant_id 71ad62c5-5947-4518-9846-976fbdd2f881 \n", - "merchant_latitude -46.53 \n", - "merchant_longitude -23.44 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-17 22:53:47 \n", - "order_id 430f9887-a563-45ee-8001-1cb29597d9dd \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 35 \n", - "origin_platform IOS " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "o_df = spark.read.parquet(str(RAW_DATA_PATH / 'order'))\n", - "explore_dataframe(o_df)\n", - "o_df.limit(3).toPandas().T" + "%load_ext autoreload\n", + "%autoreload 2\n", + "from src.config import RAW_DATA_PATH\n", + "from src.DataProcessor import explore_dataframe, extract_latest_values, add_prefix, create_trusted_order\n", + "from src.IOController import create_pyspark_session" ] }, { - "cell_type": "code", - "execution_count": 4, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(#rows, #columns) = 
(809323, 7)\n", - "root\n", - " |-- customer_id: string (nullable = true)\n", - " |-- language: string (nullable = true)\n", - " |-- created_at: string (nullable = true)\n", - " |-- active: string (nullable = true)\n", - " |-- customer_name: string (nullable = true)\n", - " |-- customer_phone_area: string (nullable = true)\n", - " |-- customer_phone_number: string (nullable = true)\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
012
customer_id00039466-560f-4e57-85a2-d4753cd901be001a1267-31a3-4f5b-a028-d7e323864b08003ae1d5-67b8-4a04-b055-0e4e9622771a
consumer_languagept-brpt-brpt-br
consumer_created_at2018-04-05T14:49:18.165Z2018-01-14T21:40:02.141Z2018-01-07T03:47:15.554Z
consumer_activetruetruetrue
consumer_customer_nameNUNOADRIELLYPAULA
consumer_customer_phone_area465962
consumer_customer_phone_number816135924231330577347597883
\n", - "
" - ], - "text/plain": [ - " 0 \\\n", - "customer_id 00039466-560f-4e57-85a2-d4753cd901be \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-04-05T14:49:18.165Z \n", - "consumer_active true \n", - "consumer_customer_name NUNO \n", - "consumer_customer_phone_area 46 \n", - "consumer_customer_phone_number 816135924 \n", - "\n", - " 1 \\\n", - "customer_id 001a1267-31a3-4f5b-a028-d7e323864b08 \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-01-14T21:40:02.141Z \n", - "consumer_active true \n", - "consumer_customer_name ADRIELLY \n", - "consumer_customer_phone_area 59 \n", - "consumer_customer_phone_number 231330577 \n", - "\n", - " 2 \n", - "customer_id 003ae1d5-67b8-4a04-b055-0e4e9622771a \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-01-07T03:47:15.554Z \n", - "consumer_active true \n", - "consumer_customer_name PAULA \n", - "consumer_customer_phone_area 62 \n", - "consumer_customer_phone_number 347597883 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "table = 'consumer'\n", - "c_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", - "explore_dataframe(c_df)\n", - "c_df = add_prefix(c_df, table)\n", - "c_df.limit(3).toPandas().T" + "Starts SparkSession." 
] }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, + "execution_count": 32, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(#rows, #columns) = (7292, 12)\n", - "root\n", - " |-- id: string (nullable = true)\n", - " |-- created_at: string (nullable = true)\n", - " |-- enabled: string (nullable = true)\n", - " |-- price_range: string (nullable = true)\n", - " |-- average_ticket: string (nullable = true)\n", - " |-- takeout_time: string (nullable = true)\n", - " |-- delivery_time: string (nullable = true)\n", - " |-- minimum_order_value: string (nullable = true)\n", - " |-- merchant_zip_code: string (nullable = true)\n", - " |-- merchant_city: string (nullable = true)\n", - " |-- merchant_state: string (nullable = true)\n", - " |-- merchant_country: string (nullable = true)\n", - "\n" + "Starting PySpark session. Check your terminal for detailed logging...\n", + "PySpark session sucessfully created.\n" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
012
merchant_id02c94103-61f3-4906-a4a9-55611db9f28c15e7f5fd-090d-47b9-9f14-b6f7fce3c95d33ca5d3d-b99f-404d-84d9-8df8f38a2261
restaurant_created_at2017-01-23T12:52:30.910Z2017-01-20T13:14:48.286Z2017-01-23T12:46:33.457Z
restaurant_enabledfalsetruetrue
restaurant_price_range335
restaurant_average_ticket60.060.0100.0
restaurant_takeout_time000
restaurant_delivery_time50045
restaurant_minimum_order_value30.030.010.0
restaurant_merchant_zip_code140255018023090
restaurant_merchant_cityRIBEIRAO PRETOSAO PAULORIO DE JANEIRO
restaurant_merchant_stateSPSPRJ
restaurant_merchant_countryBRBRBR
\n", - "
" - ], - "text/plain": [ - " 0 \\\n", - "merchant_id 02c94103-61f3-4906-a4a9-55611db9f28c \n", - "restaurant_created_at 2017-01-23T12:52:30.910Z \n", - "restaurant_enabled false \n", - "restaurant_price_range 3 \n", - "restaurant_average_ticket 60.0 \n", - "restaurant_takeout_time 0 \n", - "restaurant_delivery_time 50 \n", - "restaurant_minimum_order_value 30.0 \n", - "restaurant_merchant_zip_code 14025 \n", - "restaurant_merchant_city RIBEIRAO PRETO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "\n", - " 1 \\\n", - "merchant_id 15e7f5fd-090d-47b9-9f14-b6f7fce3c95d \n", - "restaurant_created_at 2017-01-20T13:14:48.286Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 3 \n", - "restaurant_average_ticket 60.0 \n", - "restaurant_takeout_time 0 \n", - "restaurant_delivery_time 0 \n", - "restaurant_minimum_order_value 30.0 \n", - "restaurant_merchant_zip_code 50180 \n", - "restaurant_merchant_city SAO PAULO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "\n", - " 2 \n", - "merchant_id 33ca5d3d-b99f-404d-84d9-8df8f38a2261 \n", - "restaurant_created_at 2017-01-23T12:46:33.457Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 5 \n", - "restaurant_average_ticket 100.0 \n", - "restaurant_takeout_time 0 \n", - "restaurant_delivery_time 45 \n", - "restaurant_minimum_order_value 10.0 \n", - "restaurant_merchant_zip_code 23090 \n", - "restaurant_merchant_city RIO DE JANEIRO \n", - "restaurant_merchant_state RJ \n", - "restaurant_merchant_country BR " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "table = 'restaurant'\n", - "r_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", - "explore_dataframe(r_df)\n", - "r_df = r_df.withColumnRenamed('id', 'merchant_id')\n", - "r_df = add_prefix(r_df, table)\n", - "r_df.limit(3).toPandas().T" + "spark = create_pyspark_session()" ] }, { - "cell_type": "code", - "execution_count": 
6, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(#rows, #columns) = (11075048, 4)\n", - "root\n", - " |-- created_at: timestamp (nullable = true)\n", - " |-- order_id: string (nullable = true)\n", - " |-- status_id: string (nullable = true)\n", - " |-- value: string (nullable = true)\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
status_created_atorder_idstatus_idstatus_value
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDED
12019-01-24 23:04:270002fe02-d7dc-4232-b7ac-3394019ce2407964bf63-007a-484d-a321-e9118ccc2f97REGISTERED
22019-01-24 23:04:280002fe02-d7dc-4232-b7ac-3394019ce240ca16b92b-db8f-4274-b165-929675541a9fPLACED
\n", - "
" - ], - "text/plain": [ - " status_created_at order_id \\\n", - "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "1 2019-01-24 23:04:27 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "2 2019-01-24 23:04:28 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "\n", - " status_id status_value \n", - "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", - "1 7964bf63-007a-484d-a321-e9118ccc2f97 REGISTERED \n", - "2 ca16b92b-db8f-4274-b165-929675541a9f PLACED " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "table = 'status'\n", - "s_df = spark.read.parquet(str(RAW_DATA_PATH / table))\n", - "explore_dataframe(s_df)\n", - "s_df = add_prefix(s_df, table)\n", - "s_df.limit(3).toPandas()" + "Generates Order dataset for trusted layer." ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, + "execution_count": 33, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(#rows, #columns) = (2441067, 4)\n", - "root\n", - " |-- status_created_at: timestamp (nullable = true)\n", - " |-- order_id: string (nullable = true)\n", - " |-- status_id: string (nullable = true)\n", - " |-- status_value: string (nullable = true)\n", - "\n" + "Starting processing to generate Order Items dataset...\n", + "Exporting dataset file system...\n", + "Dataset sucessfully exported to `/home/jovyan/data/trusted/order`!\n" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
status_created_atorder_idstatus_idstatus_value
02019-01-25 01:05:070002fe02-d7dc-4232-b7ac-3394019ce240b4298862-fa38-499a-93e2-a76930fb2bceCONCLUDED
12019-01-03 20:15:060012d95c-9c4b-4244-86b5-dcf87677dcc1c8058ba4-1d1c-4a1f-8401-a7042ef6ba85CONCLUDED
22019-01-06 16:20:270013fc5c-4c10-4402-886c-1b8166e4632ed0a3ffd5-4e48-4cc4-9739-d5764678c19fCONCLUDED
32019-01-03 15:55:1200251da6-aa45-4512-be58-6622a248cdffa621de13-5272-4c7c-969c-3bcd53f0515fCONCLUDED
42019-01-21 22:20:0200273652-efc8-4e7c-95b2-cd3827900e7e74f8ff33-e731-477f-a698-4755577b80a1CONCLUDED
\n", - "
" - ], - "text/plain": [ - " status_created_at order_id \\\n", - "0 2019-01-25 01:05:07 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "1 2019-01-03 20:15:06 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", - "2 2019-01-06 16:20:27 0013fc5c-4c10-4402-886c-1b8166e4632e \n", - "3 2019-01-03 15:55:12 00251da6-aa45-4512-be58-6622a248cdff \n", - "4 2019-01-21 22:20:02 00273652-efc8-4e7c-95b2-cd3827900e7e \n", - "\n", - " status_id status_value \n", - "0 b4298862-fa38-499a-93e2-a76930fb2bce CONCLUDED \n", - "1 c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 CONCLUDED \n", - "2 d0a3ffd5-4e48-4cc4-9739-d5764678c19f CONCLUDED \n", - "3 a621de13-5272-4c7c-969c-3bcd53f0515f CONCLUDED \n", - "4 74f8ff33-e731-477f-a698-4755577b80a1 CONCLUDED " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ - "s_df = extract_latest_values(df=s_df, id_col='order_id', dt_col='status_created_at')\n", - "explore_dataframe(s_df)\n", - "s_df.limit(5).toPandas()" + "tmp = create_trusted_order(spark)" ] }, { - "cell_type": "code", - "execution_count": 8, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+-------+\n", - "|status_value| count|\n", - "+------------+-------+\n", - "| CONCLUDED|2354218|\n", - "| CANCELLED| 55179|\n", - "| PLACED| 31654|\n", - "| REGISTERED| 16|\n", - "+------------+-------+\n", - "\n" - ] - } - ], "source": [ - "s_df.groupBy('status_value').count().sort('count', ascending=False).show()" + "Shows 3 rows." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -922,31 +155,19 @@ " e2213649-cede-4770-b6e0-7ac1dd4d3548\n", " \n", " \n", - " cpf\n", - " 16854185492\n", - " 05264105611\n", - " 78163676650\n", - " \n", - " \n", - " customer_name\n", - " EDUARDO\n", - " BRUNO\n", - " BRUNO\n", - " \n", - " \n", - " delivery_address_city\n", + " order_delivery_address_city\n", " SAO PAULO\n", " SAO PAULO\n", " SAO PAULO\n", " \n", " \n", - " delivery_address_country\n", + " order_delivery_address_country\n", " BR\n", " BR\n", " BR\n", " \n", " \n", - " delivery_address_district\n", + " order_delivery_address_district\n", " ITAIM BIBI\n", " PENHA DE FRANCA\n", " PENHA DE FRANCA\n", @@ -958,79 +179,79 @@ " 7632090\n", " \n", " \n", - " delivery_address_latitude\n", + " order_delivery_address_latitude\n", " -46.68\n", " -46.54\n", " -46.54\n", " \n", " \n", - " delivery_address_longitude\n", + " order_delivery_address_longitude\n", " -23.59\n", " -23.52\n", " -23.52\n", " \n", " \n", - " delivery_address_state\n", + " order_delivery_address_state\n", " SP\n", " SP\n", " SP\n", " \n", " \n", - " delivery_address_zip_code\n", + " order_delivery_address_zip_code\n", " 45381\n", " 36100\n", " 36100\n", " \n", " \n", - " items\n", + " order_items\n", " [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v...\n", " [{\"name\": \"Porção Batata Frita\", \"addition\": {...\n", " [{\"name\": \"Porção Batata Frita\", \"addition\": {...\n", " \n", " \n", - " merchant_latitude\n", + " order_merchant_latitude\n", " -46.68\n", " -46.54\n", " -46.54\n", " \n", " \n", - " merchant_longitude\n", + " order_merchant_longitude\n", " -23.59\n", " -23.52\n", " -23.52\n", " \n", " \n", - " merchant_timezone\n", + " order_merchant_timezone\n", " America/Sao_Paulo\n", " America/Sao_Paulo\n", " America/Sao_Paulo\n", " \n", " \n", - " order_created_at\n", + " order_order_created_at\n", " 2019-01-24 23:04:27\n", " 2019-01-03 18:12:24\n", " 2018-12-04 
18:12:24\n", " \n", " \n", - " order_scheduled\n", + " order_order_scheduled\n", " False\n", " False\n", " False\n", " \n", " \n", - " order_scheduled_date\n", + " order_order_scheduled_date\n", " NaT\n", " NaT\n", " NaT\n", " \n", " \n", - " order_total_amount\n", + " order_order_total_amount\n", " 27\n", " 17.5\n", " 17.5\n", " \n", " \n", - " origin_platform\n", + " order_origin_platform\n", " ANDROID\n", " ANDROID\n", " ANDROID\n", @@ -1054,24 +275,12 @@ " true\n", " \n", " \n", - " consumer_customer_name\n", - " EDUARDO\n", - " BRUNO\n", - " BRUNO\n", - " \n", - " \n", " consumer_customer_phone_area\n", " 83\n", " 60\n", " 60\n", " \n", " \n", - " consumer_customer_phone_number\n", - " 020082840\n", - " 109441873\n", - " 109441873\n", - " \n", - " \n", " restaurant_created_at\n", " 2017-01-20T13:14:41.451Z\n", " 2017-01-20T13:14:16.179Z\n", @@ -1160,183 +369,141 @@ "" ], "text/plain": [ - " 0 \\\n", - "order_id 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", - "merchant_id e3d24e4d-2f51-4987-8c39-47923b20b9be \n", - "customer_id 97c53c25-bd9b-41cb-8a0d-13cd74509f17 \n", - "cpf 16854185492 \n", - "customer_name EDUARDO \n", - "delivery_address_city SAO PAULO \n", - "delivery_address_country BR \n", - "delivery_address_district ITAIM BIBI \n", - "delivery_address_external_id 8847122 \n", - "delivery_address_latitude -46.68 \n", - "delivery_address_longitude -23.59 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 45381 \n", - "items [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... 
\n", - "merchant_latitude -46.68 \n", - "merchant_longitude -23.59 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-24 23:04:27 \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 27 \n", - "origin_platform ANDROID \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-04-05T13:20:39.644Z \n", - "consumer_active true \n", - "consumer_customer_name EDUARDO \n", - "consumer_customer_phone_area 83 \n", - "consumer_customer_phone_number 020082840 \n", - "restaurant_created_at 2017-01-20T13:14:41.451Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 1 \n", - "restaurant_average_ticket 30.0 \n", - "restaurant_takeout_time 0 \n", - "restaurant_delivery_time 40 \n", - "restaurant_minimum_order_value 30.0 \n", - "restaurant_merchant_zip_code 56560 \n", - "restaurant_merchant_city SAO PAULO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "status_created_at 2019-01-25 01:05:07 \n", - "status_id b4298862-fa38-499a-93e2-a76930fb2bce \n", - "status_value CONCLUDED \n", + " 0 \\\n", + "order_id 0002fe02-d7dc-4232-b7ac-3394019ce240 \n", + "merchant_id e3d24e4d-2f51-4987-8c39-47923b20b9be \n", + "customer_id 97c53c25-bd9b-41cb-8a0d-13cd74509f17 \n", + "order_delivery_address_city SAO PAULO \n", + "order_delivery_address_country BR \n", + "order_delivery_address_district ITAIM BIBI \n", + "delivery_address_external_id 8847122 \n", + "order_delivery_address_latitude -46.68 \n", + "order_delivery_address_longitude -23.59 \n", + "order_delivery_address_state SP \n", + "order_delivery_address_zip_code 45381 \n", + "order_items [{\"name\": \"Pastel Frangolino\", \"addition\": {\"v... 
\n", + "order_merchant_latitude -46.68 \n", + "order_merchant_longitude -23.59 \n", + "order_merchant_timezone America/Sao_Paulo \n", + "order_order_created_at 2019-01-24 23:04:27 \n", + "order_order_scheduled False \n", + "order_order_scheduled_date NaT \n", + "order_order_total_amount 27 \n", + "order_origin_platform ANDROID \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-04-05T13:20:39.644Z \n", + "consumer_active true \n", + "consumer_customer_phone_area 83 \n", + "restaurant_created_at 2017-01-20T13:14:41.451Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 1 \n", + "restaurant_average_ticket 30.0 \n", + "restaurant_takeout_time 0 \n", + "restaurant_delivery_time 40 \n", + "restaurant_minimum_order_value 30.0 \n", + "restaurant_merchant_zip_code 56560 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "status_created_at 2019-01-25 01:05:07 \n", + "status_id b4298862-fa38-499a-93e2-a76930fb2bce \n", + "status_value CONCLUDED \n", "\n", - " 1 \\\n", - "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", - "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", - "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", - "cpf 05264105611 \n", - "customer_name BRUNO \n", - "delivery_address_city SAO PAULO \n", - "delivery_address_country BR \n", - "delivery_address_district PENHA DE FRANCA \n", - "delivery_address_external_id 7632090 \n", - "delivery_address_latitude -46.54 \n", - "delivery_address_longitude -23.52 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 36100 \n", - "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... 
\n", - "merchant_latitude -46.54 \n", - "merchant_longitude -23.52 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2019-01-03 18:12:24 \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 17.5 \n", - "origin_platform ANDROID \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-01-06T14:31:43.348Z \n", - "consumer_active true \n", - "consumer_customer_name BRUNO \n", - "consumer_customer_phone_area 60 \n", - "consumer_customer_phone_number 109441873 \n", - "restaurant_created_at 2017-01-20T13:14:16.179Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 1 \n", - "restaurant_average_ticket 30.0 \n", - "restaurant_takeout_time 20 \n", - "restaurant_delivery_time 0 \n", - "restaurant_minimum_order_value 0.0 \n", - "restaurant_merchant_zip_code 36350 \n", - "restaurant_merchant_city SAO PAULO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "status_created_at 2019-01-03 20:15:06 \n", - "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", - "status_value CONCLUDED \n", + " 1 \\\n", + "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", + "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", + "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", + "order_delivery_address_city SAO PAULO \n", + "order_delivery_address_country BR \n", + "order_delivery_address_district PENHA DE FRANCA \n", + "delivery_address_external_id 7632090 \n", + "order_delivery_address_latitude -46.54 \n", + "order_delivery_address_longitude -23.52 \n", + "order_delivery_address_state SP \n", + "order_delivery_address_zip_code 36100 \n", + "order_items [{\"name\": \"Porção Batata Frita\", \"addition\": {... 
\n", + "order_merchant_latitude -46.54 \n", + "order_merchant_longitude -23.52 \n", + "order_merchant_timezone America/Sao_Paulo \n", + "order_order_created_at 2019-01-03 18:12:24 \n", + "order_order_scheduled False \n", + "order_order_scheduled_date NaT \n", + "order_order_total_amount 17.5 \n", + "order_origin_platform ANDROID \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-06T14:31:43.348Z \n", + "consumer_active true \n", + "consumer_customer_phone_area 60 \n", + "restaurant_created_at 2017-01-20T13:14:16.179Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 1 \n", + "restaurant_average_ticket 30.0 \n", + "restaurant_takeout_time 20 \n", + "restaurant_delivery_time 0 \n", + "restaurant_minimum_order_value 0.0 \n", + "restaurant_merchant_zip_code 36350 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "status_created_at 2019-01-03 20:15:06 \n", + "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", + "status_value CONCLUDED \n", "\n", - " 2 \n", - "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", - "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", - "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", - "cpf 78163676650 \n", - "customer_name BRUNO \n", - "delivery_address_city SAO PAULO \n", - "delivery_address_country BR \n", - "delivery_address_district PENHA DE FRANCA \n", - "delivery_address_external_id 7632090 \n", - "delivery_address_latitude -46.54 \n", - "delivery_address_longitude -23.52 \n", - "delivery_address_state SP \n", - "delivery_address_zip_code 36100 \n", - "items [{\"name\": \"Porção Batata Frita\", \"addition\": {... 
\n", - "merchant_latitude -46.54 \n", - "merchant_longitude -23.52 \n", - "merchant_timezone America/Sao_Paulo \n", - "order_created_at 2018-12-04 18:12:24 \n", - "order_scheduled False \n", - "order_scheduled_date NaT \n", - "order_total_amount 17.5 \n", - "origin_platform ANDROID \n", - "consumer_language pt-br \n", - "consumer_created_at 2018-01-06T14:31:43.348Z \n", - "consumer_active true \n", - "consumer_customer_name BRUNO \n", - "consumer_customer_phone_area 60 \n", - "consumer_customer_phone_number 109441873 \n", - "restaurant_created_at 2017-01-20T13:14:16.179Z \n", - "restaurant_enabled true \n", - "restaurant_price_range 1 \n", - "restaurant_average_ticket 30.0 \n", - "restaurant_takeout_time 20 \n", - "restaurant_delivery_time 0 \n", - "restaurant_minimum_order_value 0.0 \n", - "restaurant_merchant_zip_code 36350 \n", - "restaurant_merchant_city SAO PAULO \n", - "restaurant_merchant_state SP \n", - "restaurant_merchant_country BR \n", - "status_created_at 2019-01-03 20:15:06 \n", - "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", - "status_value CONCLUDED " + " 2 \n", + "order_id 0012d95c-9c4b-4244-86b5-dcf87677dcc1 \n", + "merchant_id e66b0dcc-ffa9-42ee-a864-60977672c3ef \n", + "customer_id e2213649-cede-4770-b6e0-7ac1dd4d3548 \n", + "order_delivery_address_city SAO PAULO \n", + "order_delivery_address_country BR \n", + "order_delivery_address_district PENHA DE FRANCA \n", + "delivery_address_external_id 7632090 \n", + "order_delivery_address_latitude -46.54 \n", + "order_delivery_address_longitude -23.52 \n", + "order_delivery_address_state SP \n", + "order_delivery_address_zip_code 36100 \n", + "order_items [{\"name\": \"Porção Batata Frita\", \"addition\": {... 
\n", + "order_merchant_latitude -46.54 \n", + "order_merchant_longitude -23.52 \n", + "order_merchant_timezone America/Sao_Paulo \n", + "order_order_created_at 2018-12-04 18:12:24 \n", + "order_order_scheduled False \n", + "order_order_scheduled_date NaT \n", + "order_order_total_amount 17.5 \n", + "order_origin_platform ANDROID \n", + "consumer_language pt-br \n", + "consumer_created_at 2018-01-06T14:31:43.348Z \n", + "consumer_active true \n", + "consumer_customer_phone_area 60 \n", + "restaurant_created_at 2017-01-20T13:14:16.179Z \n", + "restaurant_enabled true \n", + "restaurant_price_range 1 \n", + "restaurant_average_ticket 30.0 \n", + "restaurant_takeout_time 20 \n", + "restaurant_delivery_time 0 \n", + "restaurant_minimum_order_value 0.0 \n", + "restaurant_merchant_zip_code 36350 \n", + "restaurant_merchant_city SAO PAULO \n", + "restaurant_merchant_state SP \n", + "restaurant_merchant_country BR \n", + "status_created_at 2019-01-03 20:15:06 \n", + "status_id c8058ba4-1d1c-4a1f-8401-a7042ef6ba85 \n", + "status_value CONCLUDED " ] }, - "execution_count": 9, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tmp = (o_df\n", - " .join(c_df, on='customer_id', how='left')\n", - " .join(r_df, on='merchant_id', how='left')\n", - " .join(s_df, on='order_id', how='left')\n", - " )\n", "tmp.limit(3).toPandas().T" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Fix schema and anonymize sensitive data" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# for dtype, cols in dtypes.items():\n", - "# for col in cols:\n", - "# df = df.withColumn(col, df[col].cast(dtype))" + "Order dataset successfully generated!" 
] } ], diff --git a/dev/docker-volume/src/DataProcessor.py b/dev/docker-volume/src/DataProcessor.py index dc48e14..6e6d9c6 100644 --- a/dev/docker-volume/src/DataProcessor.py +++ b/dev/docker-volume/src/DataProcessor.py @@ -1,7 +1,7 @@ -from src.config import TRUSTED_DATA_PATH, RAW_DATA_PATH from pyspark.sql import Window from pyspark.sql.dataframe import DataFrame from pyspark.sql.session import SparkSession +from src.config import TRUSTED_DATA_PATH, RAW_DATA_PATH from pyspark.sql.types import ArrayType, StructField, StructType, StringType, FloatType from pyspark.sql.functions import from_json, explode, flatten, col, rank, col, monotonically_increasing_id, desc @@ -13,7 +13,7 @@ def explore_dataframe(df:DataFrame): print('(#rows, #columns) =', (df.count(), len(df.columns))) return df.printSchema() -def fix_schema(df:DataFrame, dtypes:dict): +def fix_dataframe_dtypes(df:DataFrame, dtypes:dict): """ Return DataFrame `df` with corrected schema based on dtypes """ @@ -23,15 +23,16 @@ def fix_schema(df:DataFrame, dtypes:dict): return df -def fix_order_schema(df): +def fix_order_dtypes(df:DataFrame): dtypes = { 'float': [ - 'delivery_address_latitude', 'delivery_address_longitude', - 'merchant_latitude', 'merchant_longitude', 'order_total_amount'], + 'order_delivery_address_latitude', 'order_delivery_address_longitude', 'order_merchant_latitude', + 'order_merchant_longitude', 'order_order_total_amount', 'restaurant_price_range', + 'restaurant_average_ticket', 'restaurant_takeout_time', 'restaurant_delivery_time'], 'bigint': [ - 'cpf', 'delivery_address_zip_code'] - } + 'order_delivery_address_zip_code', 'restaurant_merchant_zip_code' + ]} df = fix_schema(df, dtypes) @@ -68,7 +69,7 @@ def create_trusted_order_items(spark:SparkSession): tmp = tmp.dropDuplicates() output_path = TRUSTED_DATA_PATH / 'order_items' - print(f'Exporting dataset file system...') + print(f'Exporting dataset to file system...') tmp.write.parquet(str(output_path)) @@ -91,6 +92,9 @@ def 
extract_latest_values(df:DataFrame, id_col:str, dt_col:str): return df def load_sanitized_dataframe(table:str, spark:SparkSession): + """ + Loads DataFrame into standard for produced joined datamart dataset `Order`. + """ df = spark.read.parquet(str(RAW_DATA_PATH / table)) df = add_prefix(df, table) @@ -118,7 +122,7 @@ def create_trusted_order(spark:SparkSession): .dropDuplicates() ) - print(f'Exporting dataset file system...') + print(f'Exporting dataset to file system...') output_path = TRUSTED_DATA_PATH / 'order' @@ -126,7 +130,11 @@ def create_trusted_order(spark:SparkSession): sensitive_data_columns = ['order_cpf', 'order_customer_name', 'consumer_customer_name', 'consumer_customer_phone_number'] tmp = tmp.drop(*sensitive_data_columns) - tmp.write.parquet(str(output_path)) + # fix dataset data types + tmp = fix_order_dtypes(tmp) + + # exports data partitioned by merchant's time at order creation + tmp.write.partitionBy('order_order_created_at').parquet(str(output_path)) print(f'Dataset sucessfully exported to `{output_path}`!') diff --git a/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc b/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc index 2847d51..9486761 100644 Binary files a/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc and b/dev/docker-volume/src/__pycache__/DataProcessor.cpython-38.pyc differ