From 4dabad53ea24fb1a091ae8b6efb094d6f1f34d15 Mon Sep 17 00:00:00 2001 From: Luiz Eduardo Date: Wed, 17 Jul 2024 02:25:29 -0300 Subject: [PATCH] Adicionar dataset world_olympedia_olympics --- dbt_project.yml | 3 + .../code/[code]world_olympedia_olympics.ipynb | 296 ++++++++++++++++++ models/world_olympedia_olympics/schema.yml | 198 ++++++++++++ .../world_olympedia_olympics__athlete_bio.sql | 14 + ...ympedia_olympics__athlete_event_result.sql | 14 + .../world_olympedia_olympics__country.sql | 3 + .../world_olympedia_olympics__game.sql | 13 + ...d_olympedia_olympics__game_medal_tally.sql | 12 + .../world_olympedia_olympics__result.sql | 15 + 9 files changed, 568 insertions(+) create mode 100644 models/world_olympedia_olympics/code/[code]world_olympedia_olympics.ipynb create mode 100644 models/world_olympedia_olympics/schema.yml create mode 100644 models/world_olympedia_olympics/world_olympedia_olympics__athlete_bio.sql create mode 100644 models/world_olympedia_olympics/world_olympedia_olympics__athlete_event_result.sql create mode 100644 models/world_olympedia_olympics/world_olympedia_olympics__country.sql create mode 100644 models/world_olympedia_olympics/world_olympedia_olympics__game.sql create mode 100644 models/world_olympedia_olympics/world_olympedia_olympics__game_medal_tally.sql create mode 100644 models/world_olympedia_olympics/world_olympedia_olympics__result.sql diff --git a/dbt_project.yml b/dbt_project.yml index 4b32613c..f44fd18a 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -345,6 +345,9 @@ models: world_oecd_public_finance: +materialized: table +schema: world_oecd_public_finance + world_olympedia_olympics: + +materialized: table + +schema: world_olympedia_olympics world_wb_mides: +materialized: table +schema: world_wb_mides diff --git a/models/world_olympedia_olympics/code/[code]world_olympedia_olympics.ipynb b/models/world_olympedia_olympics/code/[code]world_olympedia_olympics.ipynb new file mode 100644 index 00000000..99dc4cae --- /dev/null 
+++ b/models/world_olympedia_olympics/code/[code]world_olympedia_olympics.ipynb @@ -0,0 +1,296 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Download datasets (Necessario chave api kaggle)\n", + "\n", + "- Crie uma conta em [Kaggle](https://www.kaggle.com/)\n", + "- Settings\n", + "- Api\n", + "- Create New Token" + ], + "metadata": { + "id": "DdcVtltbUNlc" + } + }, + { + "cell_type": "code", + "source": [ + "from IPython.display import clear_output\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "import re\n", + "import numpy as np\n", + "import os\n", + "\n", + "! pip install -q opendatasets\n", + "clear_output()\n", + "\n", + "import opendatasets as od\n", + "\n", + "od.download('https://www.kaggle.com/datasets/josephcheng123456/olympic-historical-dataset-from-olympediaorg')" + ], + "metadata": { + "id": "rtDRFJrQT2sj" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Codigos" + ], + "metadata": { + "id": "HdWEm92kP4QX" + } + }, + { + "cell_type": "markdown", + "source": [ + "## imports" + ], + "metadata": { + "id": "5Spt1SLjgu50" + } + }, + { + "cell_type": "code", + "source": [ + "def conv_data(valor: str) -> datetime|None:  # parse 'DD Month YYYY' text; None when the value is missing or does not match\n", + "    try:\n", + "        data_datetime = datetime.strptime(valor, \"%d %B %Y\")\n", + "        return data_datetime\n", + "    except Exception:  # not a bare except, so KeyboardInterrupt / SystemExit still propagate\n", + "        return None\n", + "\n", + "def get_year(valor: str) -> np.int64|type(pd.NA):  # first 4-digit run in the text as an integer; pd.NA when there is none\n", + "    try:\n", + "        year = re.findall(r'\\d{4}', valor)[0]\n", + "        return np.int64(year)\n", + "    except Exception:  # non-string input (e.g. NaN) or no 4-digit match\n", + "        return pd.NA" + ], + "metadata": { + "id": "Jw9dHrsn6bho" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## athlete_bio" + ], + "metadata": { + "id": "iFii9f1GewTE" + } + }, + { + 
"cell_type": "code", + "source": [ + "from datetime import datetime\n", + "import re\n", + "import numpy as np\n", + "import os\n", + "\n", + "\n", + "def conv_data(valor: str) -> datetime|None:  # parse 'DD Month YYYY' text; None when the value is missing or does not match\n", + "    try:\n", + "        data_datetime = datetime.strptime(valor, \"%d %B %Y\")\n", + "        return data_datetime\n", + "    except Exception:  # not a bare except, so KeyboardInterrupt / SystemExit still propagate\n", + "        return None\n", + "\n", + "def get_year(valor: str) -> np.int64|type(pd.NA):  # first 4-digit run in the text as an integer; pd.NA when there is none\n", + "    try:\n", + "        year = re.findall(r'\\d{4}', valor)[0]\n", + "        return np.int64(year)\n", + "    except Exception:  # non-string input (e.g. NaN) or no 4-digit match\n", + "        return pd.NA\n", + "\n", + "dtypes = {\n", + " \"athlete_id\": str,\n", + " \"name\": str,\n", + " \"sex\": str,\n", + " \"born\": str,\n", + " \"height\": float,\n", + " \"country\": str,\n", + " \"country_noc\": str,\n", + " \"description\": str,\n", + " \"special_notes\": str\n", + " }\n", + "\n", + "df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympic_Athlete_Bio.csv\",\n", + " dtype=dtypes)\n", + "\n", + "df.weight = pd.to_numeric(df.weight, errors='coerce')\n", + "\n", + "tdata = df['born'].apply(conv_data)\n", + "anos = df['born'].apply(get_year)\n", + "\n", + "tdata = tdata.dt.strftime('%Y-%m-%d')\n", + "df.born = tdata\n", + "\n", + "df[\"year_born\"] = anos\n", + "\n", + "os.makedirs(\"output\", exist_ok=True)\n", + "\n", + "df.to_csv(\"output/athlete_bio.csv\", index=False)" + ], + "metadata": { + "id": "dtuptL0DZ3dL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## athlete_event_result" + ], + "metadata": { + "id": "q_GAqd9pezNj" + } + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympic_Athlete_Event_Results.csv\", dtype=str)\n", + "\n", + "os.makedirs(\"output\", exist_ok=True)\n", + "\n", + "df.to_csv(\"output/athlete_event_result.csv\", index=False)  # same relative folder that os.makedirs creates above" + ], + "metadata": { + "id": "Jp7kH493eVnf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 
game_medal_tally" + ], + "metadata": { + "id": "7H7N6J8bgVfc" + } + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympic_Games_Medal_Tally.csv\",\n", + " dtype={\"edition_id\": str})\n", + "\n", + "os.makedirs(\"output\", exist_ok=True)\n", + "\n", + "df.to_csv(\"output/game_medal_tally.csv\", index=False)" + ], + "metadata": { + "id": "qY7T15cRgd7-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## result" + ], + "metadata": { + "id": "VfhzG5k3xo5F" + } + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympic_Results.csv\",\n", + " dtype=str)\n", + "\n", + "os.makedirs(\"output\", exist_ok=True)\n", + "\n", + "df.to_csv(\"output/result.csv\", index=False)" + ], + "metadata": { + "id": "KG0GjZUoxrzV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## country" + ], + "metadata": { + "id": "PgOFkMmD2oYW" + } + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympics_Country.csv\",\n", + " dtype=str)\n", + "\n", + "os.makedirs(\"output\", exist_ok=True)\n", + "\n", + "df.to_csv(\"output/country.csv\", index=False)" + ], + "metadata": { + "id": "jVqnhaHklYIm" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## game" + ], + "metadata": { + "id": "3PJoF3tQ6WHm" + } + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympics_Games.csv\",\n", + " dtype={\"edition_id\": str})\n", + "\n", + "start_tdata = df.start_date.str.replace(\" 2021\", \"\") + \" \" + df.year.astype(str) # One of the dates already contains the year 2021, so it has to be removed before the year is appended\n", + "end_tdata = df.end_date.str.replace(\" 2021\", \"\") + \" \" + 
df.year.astype(str)\n", + "\n", + "start_tdata = start_tdata.apply(conv_data)\n", + "end_tdata = end_tdata.apply(conv_data)\n", + "\n", + "df[\"start_date\"] = start_tdata\n", + "df[\"end_date\"] = end_tdata\n", + "\n", + "os.makedirs(\"output\", exist_ok=True)\n", + "\n", + "df.to_csv(\"output/game.csv\", index=False)" + ], + "metadata": { + "id": "8iVD-Xom6Xp_" + }, + "execution_count": null, + "outputs": [] + } + ] +} diff --git a/models/world_olympedia_olympics/schema.yml b/models/world_olympedia_olympics/schema.yml new file mode 100644 index 00000000..8b5a0a9c --- /dev/null +++ b/models/world_olympedia_olympics/schema.yml @@ -0,0 +1,198 @@ +--- +version: 2 +models: + - name: world_olympedia_olympics__athlete_bio + description: Each row represents an Olympic athlete and their biological information + tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: [athlete_id] + - not_null_proportion_multiple_columns: + at_least: 0.30 + columns: + - name: athlete_id + description: Unique Athlete ID + - name: name + description: Full Name of the Athlete + - name: sex + description: Male or Female + - name: birth_date + description: Date of Birth + tests: + - relationships: + to: ref('br_bd_diretorios_data_tempo__data') + field: data.data + - name: birth_year + description: Year of birth + tests: + - relationships: + to: ref('br_bd_diretorios_data_tempo__ano') + field: ano.ano + - name: height + description: Height of Athlete + - name: weight + description: Weight of Athlete + - name: country + description: Country the Athlete represents + - name: country_noc + description: Country Code the Athlete represents + - name: description + description: Some description about the player in paragraph form + - name: special_notes + description: Special notes about the player in bullet points + - name: world_olympedia_olympics__athlete_event_result + description: Event to Athlete Results Dataset. 
Each row contains a specific Olympic + event / sport with the Athlete participating in the event, including the results + (Ranking) which they obtained. Each event could be the same sport but different + ID based on the different year the sport was played. + tests: + - not_null_proportion_multiple_columns: + at_least: 0.10 + columns: + - name: edition + description: Year - Summer / Winter - Olympics + - name: edition_id + description: Olympic Game id which could be useful in retrieving the olympic + game url + - name: country_noc + description: Country Code which the Athlete competes for + - name: sport + description: The sport for that event. Serves as a parent of multiple events. + - name: event + description: The specific event under a certain sport + - name: result_id + description: Result / event id for the actual events page + - name: athlete + description: Athlete competing for the event + - name: athlete_id + description: Athlete id which can be used to join with athlete bio. + - name: pos + description: Position that the athlete ranked for the event + - name: medal + description: Whether or not the athlete won a medal, and if so gold, silver, + or bronze + - name: is_team_sport + description: Whether or not the event is a team sport + - name: world_olympedia_olympics__game_medal_tally + description: Medal tally of countries that have won at least one medal during the + Olympic Games. Each row represents a country in each of the Olympic Games played. 
+ tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: [edition_id, country_noc] + - not_null_proportion_multiple_columns: + at_least: 0.95 + columns: + - name: year + description: Year in which the game was hosted + tests: + - relationships: + to: ref('br_bd_diretorios_data_tempo__ano') + field: ano.ano + - name: edition + description: Year - Summer / Winter - Olympics + - name: edition_id + description: Olympic Game id which could be useful in retrieving the olympic + game url + - name: country + description: Country that the medal counts in this row belong to + - name: country_noc + description: Country code of the country that the medal counts belong to + - name: gold + description: Gold medal count + - name: silver + description: Silver medal count + - name: bronze + description: Bronze medal count + - name: total + description: Total medal count + - name: world_olympedia_olympics__country + description: List of countries that have participated in the modern Olympics and their + country code + tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: [name] + - not_null_proportion_multiple_columns: + at_least: 0.95 + columns: + - name: noc + description: Country Code + - name: name + description: Country Name + - name: world_olympedia_olympics__result + description: Each row represents detailed information on the results of a specific + sporting event. 
+ tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: [result_id] + - not_null_proportion_multiple_columns: + at_least: 0.95 + columns: + - name: result_id + description: Unique result id + - name: event_title + description: Event title of the result + - name: edition + description: Olympic Game + - name: edition_id + description: Olympic Game id which could be useful in retrieving the olympic + game url + - name: sport + description: Sport for the event + - name: sport_url + description: Url for the sport (higher umbrella) to the actual event + - name: result_date + description: Start date of the event + - name: result_location + description: Location where the event is hosted + - name: result_participants + description: The number of participants and the number of countries they come from + - name: result_format + description: The format of the event + - name: result_detail + description: Extra detail about the result + - name: result_description + description: Extra description about the result + - name: world_olympedia_olympics__game + description: List of all the Olympic Games from 1896 Athens to 2022 Beijing Olympics. + The list only includes Winter and Summer Olympic Games. 
+ tests: + - dbt_utils.unique_combination_of_columns: + combination_of_columns: [edition_id] + - not_null_proportion_multiple_columns: + at_least: 0.07 + columns: + - name: year + description: year of the Olympics game + tests: + - relationships: + to: ref('br_bd_diretorios_data_tempo__ano') + field: ano.ano + - name: edition + description: Year - Summer / Winter - Olympics + - name: edition_id + description: Olympic Game id which could be useful in retrieving the olympic + game url + - name: city + description: city that the Olympics game was hosted in + - name: country_flag_url + description: Country flag url for flag image icon + - name: country_noc + description: country NOC code, which can be linked to the country + table for the country name + - name: start_date + description: Start date of the olympics event + tests: + - custom_relationships: + to: ref('br_bd_diretorios_data_tempo__data') + field: data.data + proportion_allowed_failures: 0.2 + - name: end_date + description: End date of the olympic event + tests: + - custom_relationships: + to: ref('br_bd_diretorios_data_tempo__data') + field: data.data + proportion_allowed_failures: 0.2 + - name: competition_date + description: Date of the competition + - name: is_held + description: 'Whether or not the Olympic game was held; it may not have happened due to war' diff --git a/models/world_olympedia_olympics/world_olympedia_olympics__athlete_bio.sql b/models/world_olympedia_olympics/world_olympedia_olympics__athlete_bio.sql new file mode 100644 index 00000000..f4fbbc75 --- /dev/null +++ b/models/world_olympedia_olympics/world_olympedia_olympics__athlete_bio.sql @@ -0,0 +1,14 @@ +{{ config(alias="athlete_bio", schema="world_olympedia_olympics") }} +select + safe_cast(athlete_id as string) athlete_id, + safe_cast(name as string) name, + safe_cast(sex as string) sex, + safe_cast(born as date) birth_date, + safe_cast(year_born as int64) birth_year, + safe_cast(height as float64) height, + safe_cast(weight as float64) 
weight, + safe_cast(country as string) country, + safe_cast(country_noc as string) country_noc, + safe_cast(description as string) description, + safe_cast(special_notes as string) special_notes, /* NOTE(review): the source below is hard-coded to the basedosdados-dev project; confirm that is intended outside development */ +from `basedosdados-dev.world_olympedia_olympics_staging.athlete_bio` as t diff --git a/models/world_olympedia_olympics/world_olympedia_olympics__athlete_event_result.sql b/models/world_olympedia_olympics/world_olympedia_olympics__athlete_event_result.sql new file mode 100644 index 00000000..2b684aab --- /dev/null +++ b/models/world_olympedia_olympics/world_olympedia_olympics__athlete_event_result.sql @@ -0,0 +1,14 @@ +{{ config(alias="athlete_event_result", schema="world_olympedia_olympics") }} /* Typed copy of the athlete_event_result staging table. Every column goes through SAFE_CAST, which yields NULL instead of an error when a value cannot be converted; the staging column isteamsport is exposed as is_team_sport (BOOL). NOTE(review): the source project is hard-coded to basedosdados-dev; confirm before release. */ +select + safe_cast(edition as string) edition, + safe_cast(edition_id as string) edition_id, + safe_cast(country_noc as string) country_noc, + safe_cast(sport as string) sport, + safe_cast(event as string) event, + safe_cast(result_id as string) result_id, + safe_cast(athlete as string) athlete, + safe_cast(athlete_id as string) athlete_id, + safe_cast(pos as string) pos, + safe_cast(medal as string) medal, + safe_cast(isteamsport as bool) is_team_sport, +from `basedosdados-dev.world_olympedia_olympics_staging.athlete_event_result` as t diff --git a/models/world_olympedia_olympics/world_olympedia_olympics__country.sql b/models/world_olympedia_olympics/world_olympedia_olympics__country.sql new file mode 100644 index 00000000..73a5510d --- /dev/null +++ b/models/world_olympedia_olympics/world_olympedia_olympics__country.sql @@ -0,0 +1,3 @@ +{{ config(alias="country", schema="world_olympedia_olympics") }} /* Typed copy of the country staging table; the staging column country is exposed as name. NOTE(review): the source project is hard-coded to basedosdados-dev; confirm before release. */ +select safe_cast(noc as string) noc, safe_cast(country as string) name, +from `basedosdados-dev.world_olympedia_olympics_staging.country` as t diff --git a/models/world_olympedia_olympics/world_olympedia_olympics__game.sql b/models/world_olympedia_olympics/world_olympedia_olympics__game.sql new file mode 100644 index 00000000..ebac4d64 --- /dev/null +++ 
b/models/world_olympedia_olympics/world_olympedia_olympics__game.sql @@ -0,0 +1,13 @@ +{{ config(alias="game", schema="world_olympedia_olympics") }} /* Typed copy of the game staging table. start_date and end_date are cast to DATE, competition_date stays text, and the staging column isheld is exposed as is_held. SAFE_CAST yields NULL instead of an error when a value cannot be converted. NOTE(review): is_held is cast to STRING although the name reads like a boolean; confirm against the staging values. The source project is hard-coded to basedosdados-dev; confirm before release. */ +select + safe_cast(year as int64) year, + safe_cast(edition as string) edition, + safe_cast(edition_id as string) edition_id, + safe_cast(city as string) city, + safe_cast(country_flag_url as string) country_flag_url, + safe_cast(country_noc as string) country_noc, + safe_cast(start_date as date) start_date, + safe_cast(end_date as date) end_date, + safe_cast(competition_date as string) competition_date, + safe_cast(isheld as string) is_held, +from `basedosdados-dev.world_olympedia_olympics_staging.game` as t diff --git a/models/world_olympedia_olympics/world_olympedia_olympics__game_medal_tally.sql b/models/world_olympedia_olympics/world_olympedia_olympics__game_medal_tally.sql new file mode 100644 index 00000000..358fc064 --- /dev/null +++ b/models/world_olympedia_olympics/world_olympedia_olympics__game_medal_tally.sql @@ -0,0 +1,12 @@ +{{ config(alias="game_medal_tally", schema="world_olympedia_olympics") }} /* Typed copy of the game_medal_tally staging table: year and the gold, silver, bronze and total counts are cast to INT64, the remaining columns to STRING. NOTE(review): the source project is hard-coded to basedosdados-dev; confirm before release. */ +select + safe_cast(year as int64) year, + safe_cast(edition as string) edition, + safe_cast(edition_id as string) edition_id, + safe_cast(country as string) country, + safe_cast(country_noc as string) country_noc, + safe_cast(gold as int64) gold, + safe_cast(silver as int64) silver, + safe_cast(bronze as int64) bronze, + safe_cast(total as int64) total, +from `basedosdados-dev.world_olympedia_olympics_staging.game_medal_tally` as t diff --git a/models/world_olympedia_olympics/world_olympedia_olympics__result.sql b/models/world_olympedia_olympics/world_olympedia_olympics__result.sql new file mode 100644 index 00000000..ec78c306 --- /dev/null +++ b/models/world_olympedia_olympics/world_olympedia_olympics__result.sql @@ -0,0 +1,15 @@ +{{ config(alias="result", schema="world_olympedia_olympics") }} /* Typed copy of the result staging table; every column is cast to STRING. */ +select + safe_cast(result_id as string) result_id, + safe_cast(event_title as string) event_title, + 
safe_cast(edition as string) edition, + safe_cast(edition_id as string) edition_id, + safe_cast(sport as string) sport, + safe_cast(sport_url as string) sport_url, + safe_cast(result_date as string) result_date, /* kept as text, not cast to DATE */ + safe_cast(result_location as string) result_location, + safe_cast(result_participants as string) result_participants, + safe_cast(result_format as string) result_format, + safe_cast(result_detail as string) result_detail, + safe_cast(result_description as string) result_description, /* NOTE(review): the source below is hard-coded to the basedosdados-dev project; confirm that is intended outside development */ +from `basedosdados-dev.world_olympedia_olympics_staging.result` as t