Merge pull request #8 from arfogg/collaborator_edits

Collaborator edits
arfogg · Aug 28, 2024 · aaee698 · aaee698
2 parents 7dafc93 + 5254684
commit aaee698
Show file tree

Hide file tree

Showing 7 changed files with 100 additions and 51 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-<h1 align="center">bi_multi_variate_eva</h1> 
+<h1 align="center">:bar_chart: bi_multi_variate_eva :chart_with_upwards_trend:</h1> 
 
 [![Downloads](https://img.shields.io/github/downloads/arfogg/bi_multi_variate_eva/total.svg)](#)
 [![GitHub release](https://img.shields.io/github/v/release/arfogg/bi_multi_variate_eva)](#)
@@ -17,9 +17,6 @@ Python package to run bivariate and multivariate extreme value analysis on gener
 
 **Support:** please [create an issue](https://github.com/arfogg/bi_multi_variate_eva/issues) or contact [arfogg](https://github.com/arfogg) directly. Any input on the code / issues found are greatly appreciated and will help to improve the software.
 
-## Ongoing tasks
-- [ ] Include Dáire acknowledgement statement
-
 ## Table of Contents
 - [Required Packages](#required-packages)
 - [Installing the code](#installing-the-code)

diff --git a/src/bivariate/calculate_return_periods_values.py b/src/bivariate/calculate_return_periods_values.py
@@ -25,9 +25,8 @@ def calculate_return_period(copula, sample_grid,
     ----------
     copula : copulas copula
         Copula that has been fit to some data
-    sample_grid : pd.DataFrame
-        Two columns with names same as copula, containing x and y values
-        to find the return period for.
+    sample_grid : np.array
+        Grid containing x and y values to find the return period for.
     block_size : pd.Timedelta, optional
         Size over which block maxima have been found. The default
         is pd.to_timedelta("365.2425D").
@@ -319,7 +318,7 @@ def plot_return_period_as_function_x_y(copula,
                                            ci_percentiles=ci_percentiles)
 
     # Initialise plot
-    fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(24, 6))
+    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(9, 21))
 
     # ----- RETURN PERIOD -----
     rp_cbar_norm = colors.LogNorm(vmin=np.quantile(shaped_return_period, 0.1),

diff --git a/src/bivariate/determine_AD_AI.py b/src/bivariate/determine_AD_AI.py
@@ -160,7 +160,7 @@ def plot_extremal_dependence_coefficient(x_data, y_data, x_bs_um, y_bs_um,
 
     # Formatting
     ax_edc.set_xlabel("Quantiles", fontsize=fontsize)
-    ax_edc.set_ylabel("Extremal Dependence Coefficient, $\chi$",
+    ax_edc.set_ylabel("Extremal Dependence Coefficient, $\chi_{u}$",
                       fontsize=fontsize)
     for label in (ax_edc.get_xticklabels() + ax_edc.get_yticklabels()):
         label.set_fontsize(fontsize)

diff --git a/src/bivariate/fit_copula_to_extremes.py b/src/bivariate/fit_copula_to_extremes.py
@@ -153,3 +153,56 @@ def qualitative_copula_fit_check_bivariate(x_extremes, y_extremes,
     ax[2].set_ylim(ax[0].get_ylim())
 
     return fig, ax
+
+
+def qualitative_copula_fit_check_bivariate_scatter(x_extremes, y_extremes,
+                                                   x_sample, y_sample,
+                                                   x_name, y_name):
+    """
+    Function to do a qualitative diagnostic plot for copula fit.
+
+    Parameters
+    ----------
+    x_extremes : np.array or pd.Series
+        Observed x extremes (magnitude).
+    y_extremes : np.array or pd.Series
+        Observed y extremes (magnitude).
+    x_sample : np.array or pd.Series
+        Random copula sample (x) in data scale.
+    y_sample : np.array or pd.Series
+        Random copula sample (y) in data scale.
+    x_name : string
+        Name for x.
+    y_name : string
+        Name for y.
+
+    Returns
+    -------
+    fig : matplotlib figure
+        Diagnostic figure.
+    ax : array of matplotlib axes
+        Axes from fig.
+
+    """
+
+    fontsize = 15
+    fig, ax = plt.subplots(figsize=(10, 10))
+
+    # Both both sets of component-wise maxima as a scatter plot
+    ax.plot(x_extremes, y_extremes, marker='^', color="indigo",
+            label=str(round(len(x_extremes))) + ' Observations',
+            fillstyle='none', linewidth=0., markersize=fontsize)
+    ax.plot(x_sample, y_sample, marker='*', color="darkgoldenrod",
+            label=str(round(len(x_sample))) + " Copula Samples",
+            fillstyle='none', linewidth=0., markersize=fontsize)
+
+    # Some decor
+    ax.set_xlabel(x_name, fontsize=fontsize)
+    ax.set_ylabel(y_name, fontsize=fontsize)
+    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
+        label.set_fontsize(fontsize)
+
+    ax.set_title('Component-wise maxima', fontsize=fontsize)
+    ax.legend(fontsize=fontsize)
+
+    return fig, ax
diff --git a/src/bivariate/gevd_fitter.py b/src/bivariate/gevd_fitter.py
@@ -21,7 +21,8 @@ class gevd_fitter():
     on input extrema.
     """
 
-    def __init__(self, extremes, dist=None, fit_guess={}):
+    def __init__(self, extremes, dist=None, fit_guess={},
+                 shape_threshold=0.005):
         """
         Initialise the gevd_fitter class. Fits a GEVD or
         Gumbel distribution.
@@ -40,6 +41,10 @@ def __init__(self, extremes, dist=None, fit_guess={}):
             for fitting the distribution. Keys 'c' for shape,
             'scale', and 'loc' for location. The default is
             {}.
+        shape_threshold : float, optional
+            A genextreme distribution is fitted. If the absolute value of
+            the resulting shape parameter is less than or equal to this value,
+            a gumbel_r distribution is returned instead.
 
         Returns
         -------
@@ -68,7 +73,7 @@ def __init__(self, extremes, dist=None, fit_guess={}):
                             'scale': self.scale,
                             'loc': self.location}
 
-    def fit_model(self, dist=None, fit_guess={}):
+    def fit_model(self, dist=None, fit_guess={}, shape_threshold=0.005):
         """
         Fit a GEVD or Gumbel to the parsed
         extrema.
@@ -83,8 +88,11 @@ def fit_model(self, dist=None, fit_guess={}):
         fit_guess : dictionary, optional
             Dictionary containing guess initial parameters
             for fitting the distribution. Keys 'c' for shape,
-            'scale', and 'loc' for location. The default is
-            {}.
+            'scale', and 'loc' for location. The default is {}.
+        shape_threshold : float, optional
+            A genextreme distribution is fitted. If the absolute value of
+            the resulting shape parameter is less than or equal to this value,
+            a gumbel_r distribution is returned instead.
 
         Returns
         -------
@@ -114,6 +122,7 @@ def fit_model(self, dist=None, fit_guess={}):
             elif self.distribution_name == 'gumbel_r':
                 fitted_params = self.distribution.fit(self.extremes,
                                                       **fit_guess)
+
         # Freeze the fitted model
         self.frozen_dist = self.distribution(*fitted_params)
 
@@ -133,43 +142,36 @@ def fit_model(self, dist=None, fit_guess={}):
         # Assign AIC
         self.aic = self.akaike_info_criterion(self.extremes, self.frozen_dist)
 
-    def select_distribution(self):
+    def select_distribution(self, shape_threshold=0.005):
         """
         Choose the best fitting distribution
         based on which has the lowest AIC.
 
         Parameters
         ----------
-        None.
+        shape_threshold : float, optional
+            A genextreme distribution is fitted. If the absolute value of
+            the resulting shape parameter is less than or equal to this value,
+            a gumbel_r distribution is returned instead.
 
         Returns
         -------
         None.
 
         """
 
-        # Define loop lists
-        distributions = [genextreme, gumbel_r]
-        names = ['genextreme', 'gumbel_r']
-        aic_arr = []
-
-        for dist in distributions:
-            # Fit the model
-            params = dist.fit(self.extremes)
+        # Fit GEVD, and see what the shape value is
+        shape_, location, scale = genextreme.fit(self.extremes)
 
-            # Freeze distribution
-            frozen = dist(*params)
-
-            # Calculate AIC
-            aic = self.akaike_info_criterion(self.extremes, frozen)
-            aic_arr.append(aic)
-
-        # Find model with lowest AIC
-        min_index, = np.where(aic_arr == np.min(aic_arr))
-
-        # Assign the selected distribution to the class
-        self.distribution_name = names[min_index[0]]
-        self.distribution = distributions[min_index[0]]
+        # Assess the magnitude of the shape parameter
+        if abs(shape_) > shape_threshold:
+            # Shape is large enough, genextreme is returned
+            self.distribution_name = 'genextreme'
+            self.distribution = genextreme
+        else:
+            # Shape is small, so a Gumbel is likely a better fit
+            self.distribution_name = 'gumbel_r'
+            self.distribution = gumbel_r
 
     def akaike_info_criterion(self, data, model):
         """

diff --git a/src/bivariate/return_period_plot_1d.py b/src/bivariate/return_period_plot_1d.py
@@ -138,7 +138,9 @@ def return_period_plot(gevd_fitter, bootstrap_gevd_fit, block_size, data_tag,
                   ' observed at least once\nper return period (' +
                   str(data_units_fm) + ')')
     ax.set_xscale('log')
-    ax.legend(loc='lower right')
+    # Best possible legend position, limiting to bottom 85% vertically to
+    #   leave room for axes labels
+    ax.legend(loc='best', bbox_to_anchor=(0, 0, 1, 0.85))
     ax.set_title('Return Period Plot')
 
     return ax
@@ -270,11 +272,10 @@ def calculate_return_period_empirical(data, block_size):
     Function to calculate the return period of provided
     extrema based on exceedance probability.
 
-    Pr_exceedance = 1 - (rank / (n + 1) )
-    rank - the ranking of the ordered data
-    n - number of data points
-
-    tau = 1 / (Pr_exceedance * n_ex_per_year)
+    tau = ( (N + 1) / rank ) / n
+    rank - rank of data in descending order
+    N - number of data points
+    n - number of blocks per year
 
     Parameters
     ----------
@@ -291,17 +292,14 @@ def calculate_return_period_empirical(data, block_size):
 
     """
 
-    # Calculate the number of extrema per year based on block_size
-    extrema_per_year = pd.to_timedelta("365.2425D")/block_size
+    # Calculate the number of blocks per year based on block_size
+    n = pd.to_timedelta("365.2425D")/block_size
 
     # Rank the data
-    rank = stats.rankdata(data)
-
-    # Calculate exceedance probability
-    exceedance_prob = 1. - ((rank)/(data.size + 1.0))
+    rank = (len(data) - stats.rankdata(data)) + 1
 
     # Calculate Return Period
-    tau = (1.0/(exceedance_prob*extrema_per_year))
+    tau = ((len(data) + 1) / (rank)) / n
 
     return tau
 

diff --git a/src/bivariate/transform_uniform_margins.py b/src/bivariate/transform_uniform_margins.py
@@ -394,8 +394,8 @@ def plot_diagnostic(gevd_fitter, bootstrap_gevd_fit, data_tag, data_units_fm,
                                data_units_fm, bootstrap_gevd_fit)
 
     # Some decor
-    t = ax[1, 2].text(0.03, 0.90, '(f)', transform=ax[1, 2].transAxes,
-                      va='top', ha='left')
+    t = ax[1, 2].text(0.03, 0.96, '(f)', transform=ax[1, 2].transAxes,
+                      va='bottom', ha='left')
     t.set_bbox(dict(facecolor='white', alpha=0.5, edgecolor='grey'))
 
     fig.tight_layout()