Adição do dataset world_wwf_hydrosheds

basedosdados · Jan 12, 2024 · aae87dd · aae87dd
1 parent 04326cd
commit aae87dd
Show file tree

Hide file tree

Showing 6 changed files with 3,477 additions and 0 deletions.
diff --git a/dbt_project.yml b/dbt_project.yml
@@ -277,6 +277,9 @@ models:
     world_oceanos_mapeamento:
       +materialized: table
       +schema: world_oceanos_mapeamento
+    world_wwf_hydrosheds:
+      +materialized: table
+      +schema: world_wwf_hydrosheds
     test_dataset:
       +materialized: table
       +schema: test_dataset
diff --git a/models/world_wwf_hydrosheds/code/hydrosheds_atlas.ipynb b/models/world_wwf_hydrosheds/code/hydrosheds_atlas.ipynb
@@ -0,0 +1,193 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "wpHkRnNHJJdE"
+      },
+      "outputs": [],
+      "source": [
+        "https://www.hydrosheds.org/products"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "pip install pyogrio"
+      ],
+      "metadata": {
+        "id": "5b33Qz9CJSVX"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from ast import main\n",
+        "import requests\n",
+        "from zipfile import ZipFile\n",
+        "from pyogrio import read_dataframe\n",
+        "import pandas as pd\n",
+        "import geopandas as gpd\n",
+        "import concurrent.futures\n",
+        "import os\n",
+        "import re\n",
+        "\n",
+        "def download_url(url, save_path, chunk_size=128):\n",
+        "    r = requests.get(url, stream=True)\n",
+        "    with open(save_path, 'wb') as fd:\n",
+        "        for chunk in r.iter_content(chunk_size=chunk_size):\n",
+        "            fd.write(chunk)\n",
+        "\n",
+        "def extrair(path, path_output):\n",
+        "  # loading the temp.zip and creating a zip object\n",
+        "  with ZipFile(path, 'r') as zObject:\n",
+        "\n",
+        "      # Extracting all the members of the zip\n",
+        "      # into a specific location.\n",
+        "      zObject.extractall(\n",
+        "          path=path_output)\n",
+        "\n",
+        "def threads_tasks(funcion, list1, list2):\n",
+        "  with concurrent.futures.ThreadPoolExecutor() as executor:\n",
+        "    future = executor.map(funcion, list1, list2)\n",
+        "\n",
+        "def tables(zip_names: list,\n",
+        "           links: list,\n",
+        "           skip: bool=False,\n",
+        "           flagment: bool=False,\n",
+        "           re_pattern: str=\"\",\n",
+        "           partition: str=\"\"):\n",
+        "\n",
+        "  if not skip:\n",
+        "\n",
+        "    threads_tasks(download_url, links, zip_names)\n",
+        "    output_folders = [folder.replace(\".zip\", \"\") + \"/\" for folder in zip_names]\n",
+        "    threads_tasks(extrair, zip_names, output_folders)\n",
+        "  folder_zip = zip_name[0].replace(\".zip\", \"\")\n",
+        "  folders_snp = [f\"{folder_zip}/\" + folder for folder in os.listdir(folder_zip) if folder.count(\"_shp\")]\n",
+        "  files_snp = [folders_snp[0] + \"/\" + folder for folder in os.listdir(folders_snp[0]) if folder.count(\".shp\")]\n",
+        "\n",
+        "  if flagment:\n",
+        "    for file in files_snp:\n",
+        "\n",
+        "      df_gio_fragment = read_dataframe(f'/content/' + file)\n",
+        "      df_gio_fragment.columns = df_gio_fragment.columns.str.lower()\n",
+        "\n",
+        "      region_name = re.findall(re_pattern, file)[0]\n",
+        "      folder_name = f\"output/{partition}=\" + region_name\n",
+        "      create_folder([folder_name])\n",
+        "\n",
+        "      df_gio_fragment.to_csv(folder_name + f\"/{region_name}.csv\", index=False)\n",
+        "\n",
+        "  else:\n",
+        "\n",
+        "    dfs = pd.concat([read_dataframe(f'/content/' + file) for file in files_snp[:1]])\n",
+        "    df_gio = gpd.GeoDataFrame(dfs)\n",
+        "    return df_gio\n",
+        "\n",
+        "def create_folder(name_folders: list) -> None:\n",
+        "\n",
+        "  for folder in name_folders:\n",
+        "    try:\n",
+        "      os.mkdir(folder)\n",
+        "    except:\n",
+        "      pass\n"
+      ],
+      "metadata": {
+        "id": "TtxthjyEJVD-"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# RiverAtlas"
+      ],
+      "metadata": {
+        "id": "Ap__vD7LJ-We"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "link = [\"https://figshare.com/ndownloader/files/20087486\"]\n",
+        "zip_name = [\"Global_RiverATLAS.zip\"]\n",
+        "create_folder([\"output\", \"input\"])\n",
+        "RiverATLASTESTE = tables(zip_name, link, False, True)"
+      ],
+      "metadata": {
+        "id": "EP5KIxJbJxX2"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "#BasinAtlas"
+      ],
+      "metadata": {
+        "id": "BTFtQtjBKC7e"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "link = [\"https://figshare.com/ndownloader/files/20087237\"]\n",
+        "zip_name = [\"Global_BasinATLAS.zip\"]\n",
+        "create_folder([\"output\", \"input\"])\n",
+        "BasinATLASTESTE = tables(zip_name, link, False, True,\n",
+        "                         re_pattern=\"lev(\\d*).shp\",\n",
+        "                         partition=\"level\")"
+      ],
+      "metadata": {
+        "id": "-ay26hSvJzRG"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# LakeAtlas"
+      ],
+      "metadata": {
+        "id": "IgD_tMfIKFem"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "link = [\"https://figshare.com/ndownloader/files/35959547\"]\n",
+        "zip_name = [\"Global_LakeATLAS.zip\"]\n",
+        "create_folder([\"output\", \"input\"])\n",
+        "LakeATLASTESTE = tables(zip_name, link, False, True,\n",
+        "                         re_pattern=\"_([a-z]{3}_[a-z]{4}).shp\",\n",
+        "                         partition=\"region\")"
+      ],
+      "metadata": {
+        "id": "aN89ppSBJ06_"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}