diff --git a/data_analytics/data_analysis.py b/data_analytics/data_analysis.py index 20bc30414..88523a5c6 100644 --- a/data_analytics/data_analysis.py +++ b/data_analytics/data_analysis.py @@ -2,6 +2,7 @@ import pandas as pd from sklearn.cluster import KMeans + class DataAnalysis: def __init__(self, data): self.data = data @@ -9,16 +10,18 @@ def __init__(self, data): def analyze_data(self): # Perform data cleaning and preprocessing self.data = self.data.dropna() - self.data = pd.get_dummies(self.data, columns=['transaction_type']) + self.data = pd.get_dummies(self.data, columns=["transaction_type"]) # Perform data analysis - kmeans = KMeans(n_clusters=3, random_state=0).fit(self.data[['amount', 'frequency']]) - self.data['cluster'] = kmeans.labels_ + kmeans = KMeans(n_clusters=3, random_state=0).fit( + self.data[["amount", "frequency"]] + ) + self.data["cluster"] = kmeans.labels_ # Perform statistical analysis summary_stats = self.data.describe() - summary_stats.loc['count'] = len(self.data) - summary_stats.loc['mean'] = np.mean(self.data) - summary_stats.loc['std'] = np.std(self.data) + summary_stats.loc["count"] = len(self.data) + summary_stats.loc["mean"] = np.mean(self.data) + summary_stats.loc["std"] = np.std(self.data) return summary_stats