From 5254684b53f79ced4dd3b9b691a35e665735b9f4 Mon Sep 17 00:00:00 2001
From: Alexandra Ruth Fogg <arfogg@cp.dias.ie>
Date: Wed, 28 Aug 2024 15:32:37 +0100
Subject: [PATCH] collaborator comments and README edits

---
 README.md                              |  5 +--
 src/bivariate/determine_AD_AI.py       |  2 +-
 src/bivariate/gevd_fitter.py           | 56 +++++++++++++-------------
 src/bivariate/return_period_plot_1d.py | 20 ++++-----
 4 files changed, 39 insertions(+), 44 deletions(-)
diff --git a/README.md b/README.md
index f05de50..d4160fd 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-<h1 align="center">bi_multi_variate_eva</h1> 
+<h1 align="center">:bar_chart: bi_multi_variate_eva :chart_with_upwards_trend:</h1> 
 
 [![Downloads](https://img.shields.io/github/downloads/arfogg/bi_multi_variate_eva/total.svg)](#)
 [![GitHub release](https://img.shields.io/github/v/release/arfogg/bi_multi_variate_eva)](#)
@@ -17,9 +17,6 @@ Python package to run bivariate and multivariate extreme value analysis on gener
 
 **Support:** please [create an issue](https://github.com/arfogg/bi_multi_variate_eva/issues) or contact [arfogg](https://github.com/arfogg) directly. Any input on the code / issues found are greatly appreciated and will help to improve the software.
 
-## Ongoing tasks
-- [ ] Include Dáire acknowledgement statement
-
 ## Table of Contents
 - [Required Packages](#required-packages)
 - [Installing the code](#installing-the-code)
diff --git a/src/bivariate/determine_AD_AI.py b/src/bivariate/determine_AD_AI.py
index 4dedfb8..5d092bb 100644
--- a/src/bivariate/determine_AD_AI.py
+++ b/src/bivariate/determine_AD_AI.py
@@ -160,7 +160,7 @@ def plot_extremal_dependence_coefficient(x_data, y_data, x_bs_um, y_bs_um,
 
     # Formatting
     ax_edc.set_xlabel("Quantiles", fontsize=fontsize)
-    ax_edc.set_ylabel("Extremal Dependence Coefficient, $\chi$",
+    ax_edc.set_ylabel("Extremal Dependence Coefficient, $\chi_{u}$",
                       fontsize=fontsize)
     for label in (ax_edc.get_xticklabels() + ax_edc.get_yticklabels()):
         label.set_fontsize(fontsize)
diff --git a/src/bivariate/gevd_fitter.py b/src/bivariate/gevd_fitter.py
index 4dbd241..a0cc430 100644
--- a/src/bivariate/gevd_fitter.py
+++ b/src/bivariate/gevd_fitter.py
@@ -21,7 +21,8 @@ class gevd_fitter():
     on input extrema.
     """
 
-    def __init__(self, extremes, dist=None, fit_guess={}):
+    def __init__(self, extremes, dist=None, fit_guess={},
+                 shape_threshold=0.005):
         """
         Initialise the gevd_fitter class. Fits a GEVD or
         Gumbel distribution.
@@ -40,6 +41,10 @@ def __init__(self, extremes, dist=None, fit_guess={}):
             for fitting the distribution. Keys 'c' for shape,
             'scale', and 'loc' for location. The default is
             {}.
+        shape_threshold : float, optional
+            A genextreme distribution is fitted. If the absolute value of
+            the resulting shape parameter is less than or equal to this value,
+            a gumbel_r distribution is returned instead.
 
         Returns
         -------
@@ -68,7 +73,7 @@ def __init__(self, extremes, dist=None, fit_guess={}):
                             'scale': self.scale,
                             'loc': self.location}
 
-    def fit_model(self, dist=None, fit_guess={}):
+    def fit_model(self, dist=None, fit_guess={}, shape_threshold=0.005):
         """
         Fit a GEVD or Gumbel to the parsed
         extrema.
@@ -83,8 +88,11 @@ def fit_model(self, dist=None, fit_guess={}):
         fit_guess : dictionary, optional
             Dictionary containing guess initial parameters
             for fitting the distribution. Keys 'c' for shape,
-            'scale', and 'loc' for location. The default is
-            {}.
+            'scale', and 'loc' for location. The default is {}.
+        shape_threshold : float, optional
+            A genextreme distribution is fitted. If the absolute value of
+            the resulting shape parameter is less than or equal to this value,
+            a gumbel_r distribution is returned instead.
 
         Returns
         -------
@@ -114,6 +122,7 @@ def fit_model(self, dist=None, fit_guess={}):
             elif self.distribution_name == 'gumbel_r':
                 fitted_params = self.distribution.fit(self.extremes,
                                                       **fit_guess)
+
         # Freeze the fitted model
         self.frozen_dist = self.distribution(*fitted_params)
 
@@ -133,14 +142,17 @@ def fit_model(self, dist=None, fit_guess={}):
         # Assign AIC
         self.aic = self.akaike_info_criterion(self.extremes, self.frozen_dist)
 
-    def select_distribution(self):
+    def select_distribution(self, shape_threshold=0.005):
         """
         Choose the best fitting distribution
         based on which has the lowest AIC.
 
         Parameters
         ----------
-        None.
+        shape_threshold : float, optional
+            A genextreme distribution is fitted. If the absolute value of
+            the resulting shape parameter is less than or equal to this value,
+            a gumbel_r distribution is returned instead.
 
         Returns
         -------
@@ -148,28 +160,18 @@ def select_distribution(self):
 
         """
 
-        # Define loop lists
-        distributions = [genextreme, gumbel_r]
-        names = ['genextreme', 'gumbel_r']
-        aic_arr = []
-
-        for dist in distributions:
-            # Fit the model
-            params = dist.fit(self.extremes)
+        # Fit GEVD, and see what the shape value is
+        shape_, location, scale = genextreme.fit(self.extremes)
 
-            # Freeze distribution
-            frozen = dist(*params)
-
-            # Calculate AIC
-            aic = self.akaike_info_criterion(self.extremes, frozen)
-            aic_arr.append(aic)
-
-        # Find model with lowest AIC
-        min_index, = np.where(aic_arr == np.min(aic_arr))
-
-        # Assign the selected distribution to the class
-        self.distribution_name = names[min_index[0]]
-        self.distribution = distributions[min_index[0]]
+        # Assess the magnitude of the shape parameter
+        if abs(shape_) > shape_threshold:
+            # Shape is large enough, genextreme is returned
+            self.distribution_name = 'genextreme'
+            self.distribution = genextreme
+        else:
+            # Shape is small, so a Gumbel is likely a better fit
+            self.distribution_name = 'gumbel_r'
+            self.distribution = gumbel_r
 
     def akaike_info_criterion(self, data, model):
         """
diff --git a/src/bivariate/return_period_plot_1d.py b/src/bivariate/return_period_plot_1d.py
index 8d26fa6..ccd4205 100644
--- a/src/bivariate/return_period_plot_1d.py
+++ b/src/bivariate/return_period_plot_1d.py
@@ -272,11 +272,10 @@ def calculate_return_period_empirical(data, block_size):
     Function to calculate the return period of provided
     extrema based on exceedance probability.
 
-    Pr_exceedance = 1 - (rank / (n + 1) )
-    rank - the ranking of the ordered data
-    n - number of data points
-
-    tau = 1 / (Pr_exceedance * n_ex_per_year)
+    tau = ( (N + 1) / rank ) / n
+    rank - rank of data in descending order
+    N - number of data points
+    n - number of blocks per year
 
     Parameters
     ----------
@@ -293,17 +292,14 @@ def calculate_return_period_empirical(data, block_size):
 
     """
 
-    # Calculate the number of extrema per year based on block_size
-    extrema_per_year = pd.to_timedelta("365.2425D")/block_size
+    # Calculate the number of blocks per year based on block_size
+    n = pd.to_timedelta("365.2425D")/block_size
 
     # Rank the data
-    rank = stats.rankdata(data)
-
-    # Calculate exceedance probability
-    exceedance_prob = 1. - ((rank)/(data.size + 1.0))
+    rank = (len(data) - stats.rankdata(data)) + 1
 
     # Calculate Return Period
-    tau = (1.0/(exceedance_prob*extrema_per_year))
+    tau = ((len(data) + 1) / (rank)) / n
 
     return tau