diff --git a/Consumer Complaint Dataset/README.md b/Consumer Complaint Dataset/README.md new file mode 100644 index 000000000..8c4f09bcc --- /dev/null +++ b/Consumer Complaint Dataset/README.md @@ -0,0 +1,52 @@ +# Consumer Complaints Text Classification + +## Overview +This project is a Natural Language Processing (NLP) task focused on classifying consumer complaints into specific financial categories. The goal is to streamline the complaint resolution process by routing complaints to the appropriate teams using an automated classification model. The project uses a dataset from the Consumer Financial Protection Bureau (CFPB) and applies a Random Forest Classifier to achieve this task. + +## Dataset + +### Context +The Consumer Financial Protection Bureau (CFPB) is a federal U.S. agency that helps mediate disputes between consumers and financial institutions. Consumers submit complaints through a web form, and these complaints are then tagged to assist in routing and resolving issues. An NLP model can automate the classification and routing of these complaints, improving efficiency over manual tagging. + +### Content +The dataset consists of complaint submissions from March 2020 to March 2021. Each submission has been labeled with one of nine financial product classes, which we consolidated into five main categories for this project: +- **Credit Reporting** +- **Debt Collection** +- **Mortgages and Loans** (including car, payday, and student loans) +- **Credit Cards** +- **Retail Banking** (covering checking/savings accounts, money transfers, etc.) + +After data cleaning, the dataset contained approximately 162,400 complaints, with a significant class imbalance (56% in the credit reporting class and the remaining classes distributed between 8% and 14%). + +### Acknowledgements +The dataset was organized by CFPB and cleaned by [halpert3](https://github.com/halpert3). 
+ +## Project Structure +- **notebooks/** - Contains Jupyter notebooks for data processing, training, and evaluation of the model. +- **data/** - Directory to store the dataset (`complaints_processed.csv`). +- **src/** - Source code for data preprocessing, model training, and evaluation. +- **README.md** - Project overview and instructions. + +## Model +We use a Random Forest Classifier to categorize each complaint into one of the five financial classes. The text data undergoes preprocessing, and we create a TF-IDF matrix for feature extraction. The model is then trained and evaluated on this matrix. + +### Performance +The Random Forest model achieves approximately 79.6% accuracy, with a weighted F1-score of 0.80. Performance varies across classes because of the class imbalance, with the model performing best on the majority class (Credit Reporting). + + +## Installation +1. Clone this repository: + ```bash + git clone https://github.com/yourusername/consumer-complaints-classification.git + ``` +2. Install the required libraries: + ```bash + pip install -r requirements.txt + ``` +3. Extract `complaints_processed.csv` from `ML-CaPsule/Consumer Complaint Dataset/complaints_processed.zip` and place it in the same directory as the notebook (the notebook loads the file by its bare filename). + +## Usage +1. Open the `ML-CaPsule/Consumer Complaint Dataset/Text_Classification_Random_Forest.ipynb` notebook to view the data processing and model training steps. +2. Run the cells in the notebook to reproduce the results (exact figures vary slightly between runs because the 10,000-row sample is drawn without a fixed random seed). 
+ + diff --git a/Consumer Complaint Dataset/Text_Classification_Random_Forest.ipynb b/Consumer Complaint Dataset/Text_Classification_Random_Forest.ipynb new file mode 100644 index 000000000..31fa063d3 --- /dev/null +++ b/Consumer Complaint Dataset/Text_Classification_Random_Forest.ipynb @@ -0,0 +1,415 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf83ab81", + "metadata": {}, + "source": [ + "# Text Classification Using Random Forest\n", + "\n", + "In this notebook, we will perform text classification using the **Random Forest** classifier. The data is loaded from a CSV file named **complaints_processed.csv**.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4a31a634", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] C:\\Users\\anany\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Unzipping corpora\\stopwords.zip.\n" + ] + } + ], + "source": [ + "# Import necessary libraries\n", + "import numpy as np\n", + "import pandas as pd\n", + "import re\n", + "import string\n", + "import nltk\n", + "nltk.download('stopwords')\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import confusion_matrix, classification_report\n", + "import matplotlib.pyplot as plt\n", + "import os\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2c02e2bc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
productnarrative
0credit_cardpurchase order day shipping amount receive pro...
1credit_cardforwarded message date tue subject please inve...
2retail_bankingforwarded message cc sent friday pdt subject f...
3credit_reportingpayment history missing credit report speciali...
4credit_reportingpayment history missing credit report made mis...
\n", + "
" + ], + "text/plain": [ + " product narrative\n", + "0 credit_card purchase order day shipping amount receive pro...\n", + "1 credit_card forwarded message date tue subject please inve...\n", + "2 retail_banking forwarded message cc sent friday pdt subject f...\n", + "3 credit_reporting payment history missing credit report speciali...\n", + "4 credit_reporting payment history missing credit report made mis..." + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load the data\n", + "df = pd.read_csv('complaints_processed.csv', index_col=0)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "78694eb0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--> There are 162421 rows and 2 columns\n", + "\n", + "===========================================================\n", + "\n", + "--> Missing Values:\n", + "\n", + " product 0\n", + "narrative 10\n", + "dtype: int64\n", + "\n", + "===========================================================\n", + "\n", + "Product Counts:\n", + "\n", + " product\n", + "credit_reporting 91179\n", + "debt_collection 23150\n", + "mortgages_and_loans 18990\n", + "credit_card 15566\n", + "retail_banking 13536\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Display dataset information\n", + "print(f'--> There are {df.shape[0]} rows and {df.shape[1]} columns')\n", + "print('\\n===========================================================\\n')\n", + "print('--> Missing Values:\\n\\n', df.isna().sum())\n", + "print('\\n===========================================================\\n')\n", + "print('Product Counts:\\n\\n', df['product'].value_counts())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f4fb7ed8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For Actual dataset:\n", + "\n", + " product\n", + 
"credit_reporting 56.136592\n", + "debt_collection 14.252729\n", + "mortgages_and_loans 11.692558\n", + "credit_card 9.584326\n", + "retail_banking 8.333795\n", + "Name: count, dtype: float64\n" + ] + } + ], + "source": [ + "# Drop missing values as they are minimal\n", + "df.dropna(axis=0, inplace=True)\n", + "\n", + "# Display class distribution in original dataset\n", + "print('For Actual dataset:\\n\\n', df['product'].value_counts() * 100 / len(df))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e30ffed1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sample dataset:\n", + "\n", + " product\n", + "credit_reporting 56.24\n", + "debt_collection 15.16\n", + "mortgages_and_loans 11.15\n", + "credit_card 9.37\n", + "retail_banking 8.08\n", + "Name: count, dtype: float64\n" + ] + } + ], + "source": [ + "# Sampling the data for faster processing\n", + "data = df[['product', 'narrative']].sample(n=10000)\n", + "\n", + "# Display class distribution in sample dataset\n", + "print('Sample dataset:\\n\\n', data['product'].value_counts() * 100 / len(data))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a6b5f661", + "metadata": {}, + "outputs": [], + "source": [ + "# Define a function to clean text data\n", + "stopwords = nltk.corpus.stopwords.words('english')\n", + "\n", + "def text_clean(text):\n", + " \"\"\"\n", + " This function performs the following tasks:\n", + " 1. Converts text to lowercase\n", + " 2. Removes tokens containing non-alphabetic characters (e.g. digits)\n", + " 3. Removes words with 3 or fewer characters\n", + " 4. 
Removes stopwords\n", + " \"\"\"\n", + " clean_words = []\n", + " word_list = text.split()\n", + " for word in word_list:\n", + " word_l = word.lower().strip()\n", + " if word_l.isalpha() and len(word_l) > 3 and word_l not in stopwords:\n", + " clean_words.append(word_l)\n", + " return clean_words\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "197e01ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of Term Document Matrix: (10000, 13197)\n" + ] + } + ], + "source": [ + "# Creating the Term Document Matrix\n", + "tfidf = TfidfVectorizer(analyzer=text_clean)\n", + "x_tfidf = tfidf.fit_transform(data['narrative'])\n", + "\n", + "# Display shape of matrix\n", + "print(\"Shape of Term Document Matrix:\", x_tfidf.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a5ab2a61-eefe-457d-8eb2-73353d112c36", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate the confusion matrix\n", + "cm = confusion_matrix(y_test, prediction, normalize='true')\n", + "\n", + "# Define the class names as per data\n", + "class_names = ['credit_card', 'credit_reporting', 'debt_collection', 'mortgages_and_loans', 'retail_banking']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "cad6b62d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Confusion Matrix:\n", + "\n", + " [[ 98 54 1 8 15]\n", + " [ 8 1097 9 4 0]\n", + " [ 4 176 139 9 2]\n", + " [ 5 54 4 156 4]\n", + " [ 13 23 5 2 110]]\n", + "\n", + "\n", + "Classification Report:\n", + "\n", + " precision recall f1-score support\n", + "\n", + " credit_card 0.77 0.56 0.64 176\n", + " credit_reporting 0.78 0.98 0.87 1118\n", + " debt_collection 0.88 0.42 0.57 330\n", + "mortgages_and_loans 0.87 0.70 0.78 223\n", + " retail_banking 0.84 0.72 0.77 153\n", + "\n", + " accuracy 0.80 2000\n", + " macro avg 0.83 0.68 0.73 2000\n", + " weighted avg 0.81 0.80 
0.78 2000\n", + "\n" + ] + } + ], + "source": [ + "# Train Test Split\n", + "X_train, X_test, y_train, y_test = train_test_split(x_tfidf, data['product'], test_size=0.2, random_state=42)\n", + "\n", + "# Train RandomForest Classifier\n", + "rfc = RandomForestClassifier(n_jobs=-1)\n", + "rfc_model = rfc.fit(X_train, y_train)\n", + "\n", + "# Predict on test set\n", + "prediction = rfc_model.predict(X_test)\n", + "\n", + "# Display Confusion Matrix and Classification Report\n", + "print(\"Confusion Matrix:\\n\\n\", confusion_matrix(y_test, prediction))\n", + "print(\"\\n\")\n", + "print(\"Classification Report:\\n\\n\", classification_report(y_test, prediction))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "8ce0544f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plotting the normalized confusion matrix\n", + "plt.figure(figsize=(10, 10))\n", + "plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)\n", + "plt.title(\"Normalized Confusion Matrix\")\n", + "plt.colorbar()\n", + "\n", + "# Set tick marks and labels\n", + "tick_marks = np.arange(len(class_names))\n", + "plt.xticks(tick_marks, class_names, rotation=90)\n", + "plt.yticks(tick_marks, class_names)\n", + "\n", + "# Add text annotations to each cell\n", + "for i in range(len(class_names)):\n", + " for j in range(len(class_names)):\n", + " plt.text(j, i, format(cm[i, j], \".2f\"), ha=\"center\", va=\"center\", color=\"white\" if cm[i, j] > 0.5 else \"black\")\n", + "\n", + "plt.ylabel('True label')\n", + "plt.xlabel('Predicted label')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0efafbf6-5c41-4415-bcf8-9c1ea430e067", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Consumer Complaint Dataset/complaints_processed.zip b/Consumer Complaint Dataset/complaints_processed.zip new file mode 100644 index 000000000..637a2e6ce Binary files /dev/null and b/Consumer Complaint Dataset/complaints_processed.zip differ