Merge pull request #2 from fialhocoelho/develop
Develop
fialhocoelho authored Jun 11, 2024
2 parents 0355df9 + f666016 commit 2aaebaa
Showing 10 changed files with 2,114 additions and 1,276 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -87,3 +87,5 @@ target/
 
 # Mypy cache
 .mypy_cache/
+src/data/nixtla_api.key
+src/models/nixtla_api.key
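
The two new ignore rules keep the Nixtla API key files out of version control. At runtime the key is read from the path given by `model.nixtla_api_key_path` in `config.yaml`; a minimal sketch of such a loader (hypothetical helper, not code from this diff) might look like:

```python
from pathlib import Path

def load_nixtla_api_key(key_path: str = "nixtla_api.key") -> str:
    """Read the git-ignored API key file and return the key string."""
    # Assumes the file holds a single line containing only the key.
    return Path(key_path).read_text(encoding="utf-8").strip()
```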
51 changes: 47 additions & 4 deletions config/config.yaml
@@ -1,19 +1,62 @@
 data:
   processed_path: "data/processed/"
+  intermediate_path: "data/interim/"
   raw_path: "data/raw/"
+  forecasted_path: "data/forecasted/"
+  train_folder: "santos_dataset/train/"
+  test_folder: "santos_dataset/test/"
   train_start_date: "2021-01-01 00:00:00"
   train_end_date: "2021-12-31 23:55:00"
   test_start_date: "2022-01-01 00:00:00"
   test_end_date: "2022-12-31 23:55:00"
   processed_train_df: "train_df_praticagem.csv"
   processed_test_df: "test_df_praticagem.csv"
-  intermediate_path: "data/interim/"
   target_freq: "1h"
   interp_method: "linear"
   datetime_col: "datetime"
   round_freq: "5min"
-  timegpt_fcst_file: "forecast_cache_fcst_y_validated_20240513_212354.pkl"
-  chronos_fcst_file: "chronos_forecast_cache_20240518_160948.pkl"
+  timegpt_cache_prefix: "timegpt_forecast_cache"
+  chronos_fcst_file: "chronos_forecast_cache_20240520_000009.pkl"
+  chronos_cache_prefix: "chronos_forecast_cache"
+  crop_target_datetime: "2021-01-01 00:00:00"
+  default_seed: 42
+features:
+  waves_palmas:
+    name: "waves_palmas"
+    list_features: ["hs","tp","ws"]
+    train_filename: "waves_palmas.parquet"
+    test_filename: "waves_palmas.parquet"
+    freq: "20min"
+    train_start_date: "2021-01-01 00:00:00"
+    train_end_date: "2021-12-31 23:55:00"
+    test_start_date: "2022-01-01 00:00:00"
+    test_end_date: "2022-12-31 23:55:00"
+  current_praticagem:
+    name: "current_praticagem"
+    list_features: ["cross_shore_current"]
+    train_filename: "current_praticagem.parquet"
+    test_filename: "current_praticagem.parquet"
+    freq: "5min"
+    train_start_date: "2021-01-01 00:00:00"
+    train_end_date: "2021-12-31 23:55:00"
+    test_start_date: "2022-01-01 00:00:00"
+    test_end_date: "2022-12-31 23:55:00"
 model:
   context_window_len: 168
-  forecast_len: 24
+  forecast_len: 48
+  shift: 48
+  batch_size: 32
+  windowing_mode: "fixed"
+  epochs: 500
+  lr: 0.001
+  beta: 0.5
+  input_size: 1
+  hidden_size: 64
+  train_shuffle: true
+  train_shuffle: true
+  timegpt_finetune_steps: 100
+  nixtla_api_key_path: "nixtla_api.key"
+  chronos_finetune_steps: 0
+  attempts_after_failure: 30
+  device: "cuda"
+  chronos_t5_model: "amazon/chronos-t5-large"
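
The updated config doubles `forecast_len` to 48, adds per-feature blocks under `features:`, and introduces cache prefixes plus Chronos settings under `model:`. A quick way to inspect the merged result (a sketch using PyYAML; note that for duplicated keys such as `train_shuffle`, `yaml.safe_load` silently keeps the last occurrence):

```python
import yaml

with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

print(config["model"]["forecast_len"])       # 48
print(config["model"]["chronos_t5_model"])   # amazon/chronos-t5-large
print(config["features"]["waves_palmas"]["list_features"])  # ['hs', 'tp', 'ws']
```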
214 changes: 214 additions & 0 deletions notebooks/chronos_usage.ipynb
@@ -0,0 +1,214 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run `chronos` for Santos off-shore dataset"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:utils.nexdata:Loading config file ../config/config.yaml\n",
"INFO:utils.nexdata:waves_palmas train path: ../data/raw/santos_dataset/train/waves_palmas.parquet\n",
"INFO:utils.nexdata:waves_palmas test path: ../data/raw/santos_dataset/test/waves_palmas.parquet\n",
"INFO:utils.nexdata:current_praticagem train path: ../data/raw/santos_dataset/train/current_praticagem.parquet\n",
"INFO:utils.nexdata:current_praticagem test path: ../data/raw/santos_dataset/test/current_praticagem.parquet\n",
"INFO:utils.nexdata:Random seed: 42\n",
"INFO:utils.nexdata:Default device: cuda\n",
"INFO:utils.nexdata:Defining paths...\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"import pandas as pd\n",
"from tqdm import tqdm\n",
"import matplotlib.pyplot as plt\n",
"import torch\n",
"from chronos import ChronosPipeline\n",
"import numpy as np\n",
"\n",
"sys.path.append('../src/')\n",
"from utils.nexdata import *\n",
"from utils.nexutil import *\n",
"\n",
"params = NexData(nexus_folder='../')\n",
"set_random_seeds(params.data_params['default_seed'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Configuring models, predict and save outputs to be used to `student` model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the ChronosPipeline model from the pretrained\n",
"# 'amazon/chronos-t5-large' model\n",
"chronos_pipeline = ChronosPipeline.from_pretrained(\n",
" 'amazon/chronos-t5-large',\n",
" device_map='cuda',\n",
" torch_dtype=torch.bfloat16,\n",
")\n",
"\n",
"# Iterate over each ocean variable defined in the parameters\n",
"for ocean_variable in params.features.keys():\n",
" print(f'Ocean variable: {ocean_variable}')\n",
"\n",
" # Retrieve target features and experiment IDs\n",
" target_features = params.features[ocean_variable]\n",
" id_experiment = 'chronos_forecast_composed'\n",
" id_experiment_ioa = 'chronos_ioa_composed'\n",
"\n",
" # Load train and test data for the target feature\n",
" df_train_target = pd.read_parquet(\n",
" target_features['train_filepath'])\n",
" df_test_target = pd.read_parquet(\n",
" target_features['test_filepath'])\n",
"\n",
" # Process the training dataframe with specified parameters\n",
" df_train_processed_target = process_dataframe(\n",
" df_train_target,\n",
" target_features['train_start_date'],\n",
" target_features['train_end_date'],\n",
" params.data_params['target_freq'],\n",
" params.data_params['interp_method'],\n",
" params.data_params['datetime_col'],\n",
" params.data_params['round_freq'])\n",
"\n",
" # Process the test dataframe with specified parameters\n",
" df_test_processed_target = process_dataframe(\n",
" df_test_target,\n",
" target_features['test_start_date'],\n",
" target_features['test_end_date'],\n",
" params.data_params['target_freq'],\n",
" params.data_params['interp_method'],\n",
" params.data_params['datetime_col'],\n",
" params.data_params['round_freq'])\n",
"\n",
" # Define the context and forecast window lengths and shift\n",
" context_len = params.model_params['context_window_len']\n",
" forecast_len = params.model_params['forecast_len']\n",
" shift = params.model_params['shift']\n",
" mode = params.model_params['windowing_mode']\n",
"\n",
" # Generate indices for the test set using the context and forecast lengths\n",
" X_test_index, y_test_index = generate_indices(\n",
" df_test_processed_target, context_len, forecast_len,\n",
" shift, mode)\n",
"\n",
" # Initialize DataFrames for predictions and index of agreement (IOA) values\n",
" df_y_hat = pd.DataFrame()\n",
" df_ioa = pd.DataFrame()\n",
"\n",
" # Set the index for the y_hat DataFrame\n",
" df_y_hat.index = np.concatenate(y_test_index)\n",
" df_y_hat[params.data_params['datetime_col']] = (\n",
" df_test_processed_target.loc[\n",
" df_y_hat.index, params.data_params['datetime_col']\n",
" ])\n",
"\n",
" # Iterate over each target feature for prediction\n",
" for target_feature in target_features['list_features']:\n",
" y_hat = []\n",
" ioa_list = []\n",
"\n",
" # Add training data to improve the size of the inference data\n",
" train_signal = df_train_processed_target.loc[:, \n",
" target_feature].values\n",
" len_X_test_index = len(X_test_index)\n",
"\n",
" # Iterate over each test window to generate predictions\n",
" for idx in range(len_X_test_index):\n",
" # Extract test signal for the current window\n",
" test_signal = df_test_processed_target.loc[\n",
" X_test_index[idx], target_feature].values\n",
" y_test_signal = df_test_processed_target.loc[\n",
" y_test_index[idx], target_feature].values\n",
"\n",
" # Concatenate training and test signals\n",
" composed_signal = np.concatenate(\n",
" (train_signal, test_signal))\n",
" \n",
" # Convert the composed signal to a tensor\n",
" batch_context = torch.tensor(composed_signal)\n",
" \n",
" # Generate forecast using the Chronos pipeline\n",
" forecast = chronos_pipeline.predict(\n",
" batch_context, forecast_len)\n",
" predictions = np.quantile(\n",
" forecast.numpy(), 0.5, axis=1)\n",
" \n",
" # Append predictions to the y_hat list\n",
" y_hat.extend(np.array(predictions[0]))\n",
"\n",
" # Calculate the index of agreement (IOA) for the predictions\n",
" ioa = calculate_ioa(\n",
" y_test_signal, np.array(predictions[0]))\n",
" ioa_list.append(ioa)\n",
"\n",
" # Print the progress and IOA value for the current window\n",
" print(f'Window {idx+1} from {len_X_test_index} | '\n",
" f'target feature: {target_feature} | ioa: {round(ioa,3)}')\n",
"\n",
" # Store the predictions and IOA values in the DataFrames\n",
" df_y_hat[target_feature] = y_hat\n",
" df_ioa[target_feature] = ioa_list\n",
"\n",
" # Save the predictions DataFrame to a parquet file\n",
" filename = os.path.join(\n",
" params.forecasted_dir,\n",
" f\"{target_features['name']}_{id_experiment}_\"\n",
" f\"{params.timestamp}.pkl\")\n",
" df_y_hat.to_parquet(filename)\n",
"\n",
" # Save the IOA DataFrame to a parquet file\n",
" filename_ioa = os.path.join(\n",
" params.forecasted_dir,\n",
" f\"{ocean_variable}_{id_experiment_ioa}_\"\n",
" f\"{params.timestamp}.pkl\")\n",
" df_ioa.to_parquet(filename_ioa)\n",
"\n",
" # Print the file paths of the saved files\n",
" print(filename)\n",
" print(filename_ioa)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "chronos",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
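
The notebook's windowing comes from `generate_indices` in `src/utils`, which is not part of this diff. Under the `"fixed"` windowing mode with `context_window_len: 168`, `forecast_len: 48`, and `shift: 48`, a plausible reimplementation (hypothetical sketch, not the repository's code) is:

```python
import numpy as np

def generate_windows(n_rows: int, context_len: int,
                     forecast_len: int, shift: int):
    """Fixed-shift windowing: return lists of (context, target) index arrays."""
    X_idx, y_idx = [], []
    start = 0
    while start + context_len + forecast_len <= n_rows:
        X_idx.append(np.arange(start, start + context_len))
        y_idx.append(np.arange(start + context_len,
                               start + context_len + forecast_len))
        start += shift
    return X_idx, y_idx

# Example: one year of hourly test data, 168 h context, 48 h horizon, 48 h stride
X_test_index, y_test_index = generate_windows(8760, 168, 48, 48)
```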