Skip to content

Commit

Permalink
Adicionar dataset world_olympedia_olympics
Browse files Browse the repository at this point in the history
  • Loading branch information
Winzen committed Jul 17, 2024
1 parent 864cfb9 commit 4dabad5
Show file tree
Hide file tree
Showing 9 changed files with 568 additions and 0 deletions.
3 changes: 3 additions & 0 deletions dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,9 @@ models:
world_oecd_public_finance:
+materialized: table
+schema: world_oecd_public_finance
world_olympedia_olympics:
+materialized: table
+schema: world_olympedia_olympics
world_wb_mides:
+materialized: table
+schema: world_wb_mides
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Download datasets (Necessario chave api kaggle)\n",
"\n",
"- Crie uma conta em [Kaggle](https://www.kaggle.com/)\n",
"- Settings\n",
"- Api\n",
"- Create New Token"
],
"metadata": {
"id": "DdcVtltbUNlc"
}
},
{
"cell_type": "code",
"source": [
"from IPython.display import clear_output\n",
"import pandas as pd\n",
"from datetime import datetime\n",
"import re\n",
"import numpy as np\n",
"import os\n",
"\n",
"! pip install -q opendatasets\n",
"clear_output()\n",
"\n",
"import opendatasets as od\n",
"\n",
"od.download('https://www.kaggle.com/datasets/josephcheng123456/olympic-historical-dataset-from-olympediaorg')"
],
"metadata": {
"id": "rtDRFJrQT2sj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Codigos"
],
"metadata": {
"id": "HdWEm92kP4QX"
}
},
{
"cell_type": "markdown",
"source": [
"## imports"
],
"metadata": {
"id": "5Spt1SLjgu50"
}
},
{
"cell_type": "code",
"source": [
"def conv_data(valor: str) -> str|None:\n",
" try:\n",
" data_datetime = datetime.strptime(valor, \"%d %B %Y\")\n",
" return data_datetime\n",
" except:\n",
" return None\n",
"\n",
"def get_year(valor: str) -> str|None:\n",
" try:\n",
" year = re.findall(r'\\d{4}', valor)[0]\n",
" return np.int64(year)\n",
" except:\n",
" return pd.NA"
],
"metadata": {
"id": "Jw9dHrsn6bho"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## athlete_bio"
],
"metadata": {
"id": "iFii9f1GewTE"
}
},
{
"cell_type": "code",
"source": [
"from datetime import datetime\n",
"import re\n",
"import numpy as np\n",
"import os\n",
"\n",
"\n",
"def conv_data(valor: str) -> str|None:\n",
" try:\n",
" data_datetime = datetime.strptime(valor, \"%d %B %Y\")\n",
" return data_datetime\n",
" except:\n",
" return None\n",
"\n",
"def get_year(valor: str) -> str|None:\n",
" try:\n",
" year = re.findall(r'\\d{4}', valor)[0]\n",
" return np.int64(year)\n",
" except:\n",
" return pd.NA\n",
"\n",
"dtypes = {\n",
" \"athlete_id\": str,\n",
" \"name\": str,\n",
" \"sex\": str,\n",
" \"born\": str,\n",
" \"height\": float,\n",
" \"country\": str,\n",
" \"country_noc\": str,\n",
" \"description\": str,\n",
" \"special_notes\": str\n",
" }\n",
"\n",
"df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympic_Athlete_Bio.csv\",\n",
" dtype=dtypes)\n",
"\n",
"df.weight = pd.to_numeric(df.weight, errors='coerce')\n",
"\n",
"tdata = df['born'].apply(conv_data)\n",
"anos = df['born'].apply(get_year)\n",
"\n",
"tdata = tdata.dt.strftime('%Y-%m-%d')\n",
"df.born = tdata\n",
"\n",
"df[\"year_born\"] = anos\n",
"\n",
"os.makedirs(\"output\", exist_ok=True)\n",
"\n",
"df.to_csv(\"output/athlete_bio.csv\", index=False)"
],
"metadata": {
"id": "dtuptL0DZ3dL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## athlete_event_result"
],
"metadata": {
"id": "q_GAqd9pezNj"
}
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympic_Athlete_Event_Results.csv\", dtype=str)\n",
"\n",
"os.makedirs(\"output\", exist_ok=True)\n",
"\n",
"df.to_csv(\"/content/output/athlete_event_result.csv\", index=False)"
],
"metadata": {
"id": "Jp7kH493eVnf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## game_medal_tally"
],
"metadata": {
"id": "7H7N6J8bgVfc"
}
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympic_Games_Medal_Tally.csv\",\n",
" dtype={\"edition_id\": str})\n",
"\n",
"os.makedirs(\"output\", exist_ok=True)\n",
"\n",
"df.to_csv(\"output/game_medal_tally.csv\", index=False)"
],
"metadata": {
"id": "qY7T15cRgd7-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## result"
],
"metadata": {
"id": "VfhzG5k3xo5F"
}
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympic_Results.csv\",\n",
" dtype=str)\n",
"\n",
"os.makedirs(\"output\", exist_ok=True)\n",
"\n",
"df.to_csv(\"output/result.csv\", index=False)"
],
"metadata": {
"id": "KG0GjZUoxrzV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## country"
],
"metadata": {
"id": "PgOFkMmD2oYW"
}
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympics_Country.csv\",\n",
" dtype=str)\n",
"\n",
"os.makedirs(\"output\", exist_ok=True)\n",
"\n",
"df.to_csv(\"output/country.csv\", index=False)"
],
"metadata": {
"id": "jVqnhaHklYIm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## game"
],
"metadata": {
"id": "3PJoF3tQ6WHm"
}
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv(\"/content/olympic-historical-dataset-from-olympediaorg/Olympics_Games.csv\",\n",
" dtype={\"edition_id\": str})\n",
"\n",
"start_tdata = df.start_date.str.replace(\" 2021\", \"\") + \" \" + df.year.astype(str) # Uma das datas já se encontra com o ano de 2021, necessario remoção\n",
"end_tdata = df.end_date.str.replace(\" 2021\", \"\") + \" \" + df.year.astype(str)\n",
"\n",
"start_tdata = start_tdata.apply(conv_data)\n",
"end_tdata = end_tdata.apply(conv_data)\n",
"\n",
"df[\"start_date\"] = start_tdata\n",
"df[\"end_date\"] = end_tdata\n",
"\n",
"os.makedirs(\"output\", exist_ok=True)\n",
"\n",
"df.to_csv(\"output/game.csv\", index=False)"
],
"metadata": {
"id": "8iVD-Xom6Xp_"
},
"execution_count": null,
"outputs": []
}
]
}
Loading

0 comments on commit 4dabad5

Please sign in to comment.