diff --git a/notebook.ipynb b/notebook.ipynb
new file mode 100644
index 0000000..1775390
--- /dev/null
+++ b/notebook.ipynb
@@ -0,0 +1,1916 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to /Users/a/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n",
+ "[nltk_data] Downloading package wordnet to /Users/a/nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n"
+ ]
+ }
+ ],
+ "source": [
+ "#Take a look on this\n",
+ "#https://medium.com/@sebastiannorena/finding-correlation-between-many-variables-multidimensional-dataset-with-python-5deb3f39ffb3\n",
+ " \n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import nltk\n",
+ "import re\n",
+ "import os\n",
+ "\n",
+ "from nltk.tokenize import RegexpTokenizer\n",
+ "from nltk.stem import WordNetLemmatizer,PorterStemmer\n",
+ "from nltk.corpus import stopwords\n",
+ "from zipfile import ZipFile\n",
+ "\n",
+ "\n",
+ "lemmatizer = WordNetLemmatizer()\n",
+ "stemmer = PorterStemmer() \n",
+ "\n",
+ "nltk.download('stopwords')\n",
+ "nltk.download('wordnet')\n",
+ "\n",
+ "\n",
+ "TEXT_COL_NAME = 'Text'\n",
+ "LABEL_COL_NAME = 'Label'\n",
+ "\n",
+ "TOXIC_LABEL = 'TOXIC'\n",
+ "HEALTHY_LABEL = 'HEALTHY'\n",
+ "\n",
+ "THRESHOLD = 0.3\n",
+ "\n",
+ "\n",
+ "DEFAULT_DISTRIBUTION = (50000,50000)\n",
+ "\n",
+ "\n",
+ "def exportCSV(df,name,distribution=DEFAULT_DISTRIBUTION):\n",
+ " dis_positive = distribution[0]\n",
+ " dis_negative = distribution[1]\n",
+ " \n",
+ " ds_positive = df[df[LABEL_COL_NAME] == TOXIC_LABEL]\n",
+ " ds_negative = df[df[LABEL_COL_NAME] == HEALTHY_LABEL]\n",
+ " \n",
+ " ds_positive_len = len(ds_positive)\n",
+ " ds_negative_len = len(ds_negative)\n",
+ " \n",
+ " i_ds_positive = np.arange(ds_positive_len)\n",
+ " np.random.shuffle(i_ds_positive)\n",
+ " \n",
+ " i_ds_negative = np.arange(ds_negative_len)\n",
+ " np.random.shuffle(i_ds_negative)\n",
+ " \n",
+ " max_positives = ds_positive_len\n",
+ " max_negatives = ds_negative_len\n",
+ " \n",
+ " if(dis_positive < max_positives):\n",
+ " max_positives = dis_positive\n",
+ "\n",
+ " if(dis_negative < max_negatives):\n",
+ " max_negatives = dis_negative\n",
+ " \n",
+ " i_ds_positive = i_ds_positive[:max_positives]\n",
+ " i_ds_negative = i_ds_negative[:max_negatives]\n",
+ " \n",
+ " out_df_positives = ds_positive.iloc[i_ds_positive]\n",
+ " out_df_negatives = ds_negative.iloc[i_ds_negative]\n",
+ " \n",
+ " \n",
+ " print(\"Total of positives: \",ds_positive_len,\"Total of negatives: \", ds_negative_len)\n",
+ " print(\"Total of positives exported: \",max_positives,\"Total of negatives exported: \", max_negatives)\n",
+ " \n",
+ " \n",
+ " out_df = pd.concat([out_df_positives,out_df_negatives])\n",
+ " out_df_len = len(out_df)\n",
+ " i_out_df = np.arange(out_df_len)\n",
+ " np.random.shuffle(i_out_df)\n",
+ " out_df = out_df.iloc[i_out_df]\n",
+ "\n",
+ " csv_name = name+'.csv'\n",
+ " zip_name = name+'.zip'\n",
+ " \n",
+ " try:\n",
+ " os.remove(csv_name)\n",
+ " except:\n",
+ " print(\"Error while deleting file \", csv_name)\n",
+ " \n",
+ " try:\n",
+ " os.remove(zip_name)\n",
+ " except:\n",
+ " print(\"Error while deleting file \", zip_name)\n",
+ " \n",
+ " out_df.to_csv(csv_name, index=False)\n",
+ " \n",
+ " zipObj = ZipFile(zip_name, 'w')\n",
+ " zipObj.write(csv_name)\n",
+ " zipObj.close()\n",
+ " \n",
+ "def setLabels(df):\n",
+ " positives = df[LABEL_COL_NAME] == 1\n",
+ " negatives = df[LABEL_COL_NAME] == 0\n",
+ " \n",
+ " df.loc[positives, LABEL_COL_NAME] = TOXIC_LABEL\n",
+ " df.loc[negatives, LABEL_COL_NAME] = HEALTHY_LABEL\n",
+ " \n",
+ " return df\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = pd.read_csv(\"./dataset.csv\")\n",
+ "validation_ds = pd.read_csv(\"./validation_dataset.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " target | \n",
+ " comment_text | \n",
+ " severe_toxicity | \n",
+ " obscene | \n",
+ " identity_attack | \n",
+ " insult | \n",
+ " threat | \n",
+ " asian | \n",
+ " atheist | \n",
+ " ... | \n",
+ " article_id | \n",
+ " rating | \n",
+ " funny | \n",
+ " wow | \n",
+ " sad | \n",
+ " likes | \n",
+ " disagree | \n",
+ " sexual_explicit | \n",
+ " identity_annotator_count | \n",
+ " toxicity_annotator_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 59848 | \n",
+ " 0.000000 | \n",
+ " This is so cool. It's like, 'would you want yo... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.00000 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " 2006 | \n",
+ " rejected | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 59849 | \n",
+ " 0.000000 | \n",
+ " Thank you!! This would make my life a lot less... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.00000 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " 2006 | \n",
+ " rejected | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 59852 | \n",
+ " 0.000000 | \n",
+ " This is such an urgent design problem; kudos t... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.00000 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " 2006 | \n",
+ " rejected | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 59855 | \n",
+ " 0.000000 | \n",
+ " Is this something I'll be able to install on m... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.00000 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " 2006 | \n",
+ " rejected | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 59856 | \n",
+ " 0.893617 | \n",
+ " haha you guys are a bunch of losers. | \n",
+ " 0.021277 | \n",
+ " 0.0 | \n",
+ " 0.021277 | \n",
+ " 0.87234 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 2006 | \n",
+ " rejected | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 4 | \n",
+ " 47 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 45 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id target comment_text \\\n",
+ "0 59848 0.000000 This is so cool. It's like, 'would you want yo... \n",
+ "1 59849 0.000000 Thank you!! This would make my life a lot less... \n",
+ "2 59852 0.000000 This is such an urgent design problem; kudos t... \n",
+ "3 59855 0.000000 Is this something I'll be able to install on m... \n",
+ "4 59856 0.893617 haha you guys are a bunch of losers. \n",
+ "\n",
+ " severe_toxicity obscene identity_attack insult threat asian atheist \\\n",
+ "0 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n",
+ "1 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n",
+ "2 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n",
+ "3 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n",
+ "4 0.021277 0.0 0.021277 0.87234 0.0 0.0 0.0 \n",
+ "\n",
+ " ... article_id rating funny wow sad likes disagree \\\n",
+ "0 ... 2006 rejected 0 0 0 0 0 \n",
+ "1 ... 2006 rejected 0 0 0 0 0 \n",
+ "2 ... 2006 rejected 0 0 0 0 0 \n",
+ "3 ... 2006 rejected 0 0 0 0 0 \n",
+ "4 ... 2006 rejected 0 0 0 1 0 \n",
+ "\n",
+ " sexual_explicit identity_annotator_count toxicity_annotator_count \n",
+ "0 0.0 0 4 \n",
+ "1 0.0 0 4 \n",
+ "2 0.0 0 4 \n",
+ "3 0.0 0 4 \n",
+ "4 0.0 4 47 \n",
+ "\n",
+ "[5 rows x 45 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " target | \n",
+ " severe_toxicity | \n",
+ " obscene | \n",
+ " identity_attack | \n",
+ " insult | \n",
+ " threat | \n",
+ " asian | \n",
+ " atheist | \n",
+ " bisexual | \n",
+ " ... | \n",
+ " parent_id | \n",
+ " article_id | \n",
+ " funny | \n",
+ " wow | \n",
+ " sad | \n",
+ " likes | \n",
+ " disagree | \n",
+ " sexual_explicit | \n",
+ " identity_annotator_count | \n",
+ " toxicity_annotator_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 405130.000000 | \n",
+ " 405130.000000 | \n",
+ " 405130.000000 | \n",
+ " ... | \n",
+ " 1.026228e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ " 1.804874e+06 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 3.738434e+06 | \n",
+ " 1.030173e-01 | \n",
+ " 4.582099e-03 | \n",
+ " 1.387721e-02 | \n",
+ " 2.263571e-02 | \n",
+ " 8.115273e-02 | \n",
+ " 9.311271e-03 | \n",
+ " 0.011964 | \n",
+ " 0.003205 | \n",
+ " 0.001884 | \n",
+ " ... | \n",
+ " 3.722687e+06 | \n",
+ " 2.813597e+05 | \n",
+ " 2.779269e-01 | \n",
+ " 4.420696e-02 | \n",
+ " 1.091173e-01 | \n",
+ " 2.446167e+00 | \n",
+ " 5.843688e-01 | \n",
+ " 6.605974e-03 | \n",
+ " 1.439019e+00 | \n",
+ " 8.784694e+00 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 2.445187e+06 | \n",
+ " 1.970757e-01 | \n",
+ " 2.286128e-02 | \n",
+ " 6.460419e-02 | \n",
+ " 7.873156e-02 | \n",
+ " 1.760657e-01 | \n",
+ " 4.942218e-02 | \n",
+ " 0.087166 | \n",
+ " 0.050193 | \n",
+ " 0.026077 | \n",
+ " ... | \n",
+ " 2.450261e+06 | \n",
+ " 1.039293e+05 | \n",
+ " 1.055313e+00 | \n",
+ " 2.449359e-01 | \n",
+ " 4.555363e-01 | \n",
+ " 4.727924e+00 | \n",
+ " 1.866589e+00 | \n",
+ " 4.529782e-02 | \n",
+ " 1.787041e+01 | \n",
+ " 4.350086e+01 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 5.984800e+04 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 6.100600e+04 | \n",
+ " 2.006000e+03 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 3.000000e+00 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 7.969752e+05 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 7.960188e+05 | \n",
+ " 1.601200e+05 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 4.000000e+00 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 5.223774e+06 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 5.222993e+06 | \n",
+ " 3.321260e+05 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 1.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 4.000000e+00 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 5.769854e+06 | \n",
+ " 1.666667e-01 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 9.090909e-02 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 5.775758e+06 | \n",
+ " 3.662370e+05 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 3.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 0.000000e+00 | \n",
+ " 6.000000e+00 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 6.334010e+06 | \n",
+ " 1.000000e+00 | \n",
+ " 1.000000e+00 | \n",
+ " 1.000000e+00 | \n",
+ " 1.000000e+00 | \n",
+ " 1.000000e+00 | \n",
+ " 1.000000e+00 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " ... | \n",
+ " 6.333965e+06 | \n",
+ " 3.995410e+05 | \n",
+ " 1.020000e+02 | \n",
+ " 2.100000e+01 | \n",
+ " 3.100000e+01 | \n",
+ " 3.000000e+02 | \n",
+ " 1.870000e+02 | \n",
+ " 1.000000e+00 | \n",
+ " 1.866000e+03 | \n",
+ " 4.936000e+03 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8 rows × 42 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id target severe_toxicity obscene \\\n",
+ "count 1.804874e+06 1.804874e+06 1.804874e+06 1.804874e+06 \n",
+ "mean 3.738434e+06 1.030173e-01 4.582099e-03 1.387721e-02 \n",
+ "std 2.445187e+06 1.970757e-01 2.286128e-02 6.460419e-02 \n",
+ "min 5.984800e+04 0.000000e+00 0.000000e+00 0.000000e+00 \n",
+ "25% 7.969752e+05 0.000000e+00 0.000000e+00 0.000000e+00 \n",
+ "50% 5.223774e+06 0.000000e+00 0.000000e+00 0.000000e+00 \n",
+ "75% 5.769854e+06 1.666667e-01 0.000000e+00 0.000000e+00 \n",
+ "max 6.334010e+06 1.000000e+00 1.000000e+00 1.000000e+00 \n",
+ "\n",
+ " identity_attack insult threat asian \\\n",
+ "count 1.804874e+06 1.804874e+06 1.804874e+06 405130.000000 \n",
+ "mean 2.263571e-02 8.115273e-02 9.311271e-03 0.011964 \n",
+ "std 7.873156e-02 1.760657e-01 4.942218e-02 0.087166 \n",
+ "min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 \n",
+ "25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 \n",
+ "50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 \n",
+ "75% 0.000000e+00 9.090909e-02 0.000000e+00 0.000000 \n",
+ "max 1.000000e+00 1.000000e+00 1.000000e+00 1.000000 \n",
+ "\n",
+ " atheist bisexual ... parent_id article_id \\\n",
+ "count 405130.000000 405130.000000 ... 1.026228e+06 1.804874e+06 \n",
+ "mean 0.003205 0.001884 ... 3.722687e+06 2.813597e+05 \n",
+ "std 0.050193 0.026077 ... 2.450261e+06 1.039293e+05 \n",
+ "min 0.000000 0.000000 ... 6.100600e+04 2.006000e+03 \n",
+ "25% 0.000000 0.000000 ... 7.960188e+05 1.601200e+05 \n",
+ "50% 0.000000 0.000000 ... 5.222993e+06 3.321260e+05 \n",
+ "75% 0.000000 0.000000 ... 5.775758e+06 3.662370e+05 \n",
+ "max 1.000000 1.000000 ... 6.333965e+06 3.995410e+05 \n",
+ "\n",
+ " funny wow sad likes disagree \\\n",
+ "count 1.804874e+06 1.804874e+06 1.804874e+06 1.804874e+06 1.804874e+06 \n",
+ "mean 2.779269e-01 4.420696e-02 1.091173e-01 2.446167e+00 5.843688e-01 \n",
+ "std 1.055313e+00 2.449359e-01 4.555363e-01 4.727924e+00 1.866589e+00 \n",
+ "min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
+ "25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n",
+ "50% 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00 \n",
+ "75% 0.000000e+00 0.000000e+00 0.000000e+00 3.000000e+00 0.000000e+00 \n",
+ "max 1.020000e+02 2.100000e+01 3.100000e+01 3.000000e+02 1.870000e+02 \n",
+ "\n",
+ " sexual_explicit identity_annotator_count toxicity_annotator_count \n",
+ "count 1.804874e+06 1.804874e+06 1.804874e+06 \n",
+ "mean 6.605974e-03 1.439019e+00 8.784694e+00 \n",
+ "std 4.529782e-02 1.787041e+01 4.350086e+01 \n",
+ "min 0.000000e+00 0.000000e+00 3.000000e+00 \n",
+ "25% 0.000000e+00 0.000000e+00 4.000000e+00 \n",
+ "50% 0.000000e+00 0.000000e+00 4.000000e+00 \n",
+ "75% 0.000000e+00 0.000000e+00 6.000000e+00 \n",
+ "max 1.000000e+00 1.866000e+03 4.936000e+03 \n",
+ "\n",
+ "[8 rows x 42 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Como ai lab solo acepta dos columnas, texto y class entonces eliminaremos todas las columnas excepto comment_text y target"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset2cols = dataset.loc[:, dataset.columns.intersection(['target','comment_text'])]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Cambiamos el orden de las columnas para que target esté después de comment text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset2cols = dataset2cols.reindex(columns=['comment_text','target'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " comment_text | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " This is so cool. It's like, 'would you want yo... | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Thank you!! This would make my life a lot less... | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " This is such an urgent design problem; kudos t... | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Is this something I'll be able to install on m... | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " haha you guys are a bunch of losers. | \n",
+ " 0.893617 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " comment_text target\n",
+ "0 This is so cool. It's like, 'would you want yo... 0.000000\n",
+ "1 Thank you!! This would make my life a lot less... 0.000000\n",
+ "2 This is such an urgent design problem; kudos t... 0.000000\n",
+ "3 Is this something I'll be able to install on m... 0.000000\n",
+ "4 haha you guys are a bunch of losers. 0.893617"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset2cols.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Ahora como ai lab solo acepta valores discretos tendremos que especificar un treshold para definir nuestras dos clases: no ofensivo (0) / ofensivo (1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " comment_text | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 4 | \n",
+ " haha you guys are a bunch of losers. | \n",
+ " 0.893617 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " ur a sh*tty comment. | \n",
+ " 0.666667 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " hahahahahahahahhha suck it. | \n",
+ " 0.457627 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " This is a great story. Man. I wonder if the pe... | \n",
+ " 0.440000 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " It's ridiculous that these guys are being call... | \n",
+ " 0.600000 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1804852 | \n",
+ " Xi and his comrades must be smirking over Trum... | \n",
+ " 0.300000 | \n",
+ "
\n",
+ " \n",
+ " 1804856 | \n",
+ " Believing in God or not believing in God are p... | \n",
+ " 0.500000 | \n",
+ "
\n",
+ " \n",
+ " 1804857 | \n",
+ " I take your point, but I think you're shooting... | \n",
+ " 0.700000 | \n",
+ "
\n",
+ " \n",
+ " 1804866 | \n",
+ " There's no whine like Alberta whine! | \n",
+ " 0.300000 | \n",
+ "
\n",
+ " \n",
+ " 1804872 | \n",
+ " Anyone who is quoted as having the following e... | \n",
+ " 0.621212 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
266089 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " comment_text target\n",
+ "4 haha you guys are a bunch of losers. 0.893617\n",
+ "5 ur a sh*tty comment. 0.666667\n",
+ "6 hahahahahahahahhha suck it. 0.457627\n",
+ "11 This is a great story. Man. I wonder if the pe... 0.440000\n",
+ "13 It's ridiculous that these guys are being call... 0.600000\n",
+ "... ... ...\n",
+ "1804852 Xi and his comrades must be smirking over Trum... 0.300000\n",
+ "1804856 Believing in God or not believing in God are p... 0.500000\n",
+ "1804857 I take your point, but I think you're shooting... 0.700000\n",
+ "1804866 There's no whine like Alberta whine! 0.300000\n",
+ "1804872 Anyone who is quoted as having the following e... 0.621212\n",
+ "\n",
+ "[266089 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset2cols[dataset2cols.target >= THRESHOLD]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset2cols.insert(2, LABEL_COL_NAME, np.zeros(len(dataset2cols)), True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " comment_text | \n",
+ " target | \n",
+ " Label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " This is so cool. It's like, 'would you want yo... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Thank you!! This would make my life a lot less... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " This is such an urgent design problem; kudos t... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Is this something I'll be able to install on m... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " haha you guys are a bunch of losers. | \n",
+ " 0.893617 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1804869 | \n",
+ " Maybe the tax on \"things\" would be collected w... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1804870 | \n",
+ " What do you call people who STILL think the di... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1804871 | \n",
+ " thank you ,,,right or wrong,,, i am following ... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1804872 | \n",
+ " Anyone who is quoted as having the following e... | \n",
+ " 0.621212 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1804873 | \n",
+ " Students defined as EBD are legally just as di... | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1804874 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " comment_text target Label\n",
+ "0 This is so cool. It's like, 'would you want yo... 0.000000 0.0\n",
+ "1 Thank you!! This would make my life a lot less... 0.000000 0.0\n",
+ "2 This is such an urgent design problem; kudos t... 0.000000 0.0\n",
+ "3 Is this something I'll be able to install on m... 0.000000 0.0\n",
+ "4 haha you guys are a bunch of losers. 0.893617 0.0\n",
+ "... ... ... ...\n",
+ "1804869 Maybe the tax on \"things\" would be collected w... 0.000000 0.0\n",
+ "1804870 What do you call people who STILL think the di... 0.000000 0.0\n",
+ "1804871 thank you ,,,right or wrong,,, i am following ... 0.000000 0.0\n",
+ "1804872 Anyone who is quoted as having the following e... 0.621212 0.0\n",
+ "1804873 Students defined as EBD are legally just as di... 0.000000 0.0\n",
+ "\n",
+ "[1804874 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset2cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset2cols[LABEL_COL_NAME].values[dataset2cols.target >= THRESHOLD] = 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " Label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 266089.000000 | \n",
+ " 266089.0 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 0.517449 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 0.186804 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 0.300000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 0.400000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 0.500000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 0.644737 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 1.000000 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target Label\n",
+ "count 266089.000000 266089.0\n",
+ "mean 0.517449 1.0\n",
+ "std 0.186804 0.0\n",
+ "min 0.300000 1.0\n",
+ "25% 0.400000 1.0\n",
+ "50% 0.500000 1.0\n",
+ "75% 0.644737 1.0\n",
+ "max 1.000000 1.0"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset2cols[dataset2cols.target >= THRESHOLD].describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "finalDataset = dataset2cols.reindex(columns=['comment_text',LABEL_COL_NAME])\n",
+ "finalDataset = finalDataset.rename(columns={\"comment_text\": TEXT_COL_NAME})\n",
+ "finalDataset = setLabels(finalDataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Text | \n",
+ " Label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " This is so cool. It's like, 'would you want yo... | \n",
+ " HEALTHY | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Thank you!! This would make my life a lot less... | \n",
+ " HEALTHY | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " This is such an urgent design problem; kudos t... | \n",
+ " HEALTHY | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Is this something I'll be able to install on m... | \n",
+ " HEALTHY | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " haha you guys are a bunch of losers. | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Text Label\n",
+ "0 This is so cool. It's like, 'would you want yo... HEALTHY\n",
+ "1 Thank you!! This would make my life a lot less... HEALTHY\n",
+ "2 This is such an urgent design problem; kudos t... HEALTHY\n",
+ "3 Is this something I'll be able to install on m... HEALTHY\n",
+ "4 haha you guys are a bunch of losers. TOXIC"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "finalDataset.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total of positives: 266089 Total of negatives: 1538785\n",
+ "Total of positives exported: 300 Total of negatives exported: 300\n",
+ "Total of positives: 266089 Total of negatives: 1538785\n",
+ "Total of positives exported: 3000 Total of negatives exported: 3000\n",
+ "Total of positives: 266089 Total of negatives: 1538785\n",
+ "Total of positives exported: 30000 Total of negatives exported: 30000\n",
+ "Total of positives: 266089 Total of negatives: 1538785\n",
+ "Total of positives exported: 50000 Total of negatives exported: 50000\n",
+ "Total of positives: 266089 Total of negatives: 1538785\n",
+ "Total of positives exported: 100000 Total of negatives exported: 100000\n",
+ "Error while deleting file train_small_100k.csv\n",
+ "Error while deleting file train_small_100k.zip\n"
+ ]
+ }
+ ],
+ "source": [
+ "exportCSV(finalDataset,'train_small_300',(300,300))\n",
+ "exportCSV(finalDataset,'train_small_3k',(3000,3000))\n",
+ "exportCSV(finalDataset,'train_small_30k',(30000,30000))\n",
+ "exportCSV(finalDataset,'train_small_50k',(50000,50000))\n",
+ "exportCSV(finalDataset,'train_small_100k',(100000,100000))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Despues de tener nuestro dataset listo y exportado vamos ahora a trabajar nuestro dataset de validacion que se encuentra acá https://github.com/t-davidson/hate-speech-and-offensive-language/tree/master/data\n",
+ "\n",
+ "Si en el dataset de validacion la clase es 2 entonces es no ofensivo\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " count | \n",
+ " hate_speech | \n",
+ " offensive_language | \n",
+ " neither | \n",
+ " class | \n",
+ " tweet | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " !!! RT @mayasolovely: As a woman you shouldn't... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " !!!!! RT @mleew17: boy dats cold...tyga dwn ba... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 count hate_speech offensive_language neither class \\\n",
+ "0 0 3 0 0 3 2 \n",
+ "1 1 3 0 3 0 1 \n",
+ "2 2 3 0 3 0 1 \n",
+ "3 3 3 0 2 1 1 \n",
+ "4 4 6 0 6 0 1 \n",
+ "\n",
+ " tweet \n",
+ "0 !!! RT @mayasolovely: As a woman you shouldn't... \n",
+ "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... \n",
+ "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... \n",
+ "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... \n",
+ "4 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... "
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_ds.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_ds = validation_ds.loc[:, validation_ds.columns.intersection(['tweet','class'])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " class | \n",
+ " tweet | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2 | \n",
+ " !!! RT @mayasolovely: As a woman you shouldn't... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " !!!!! RT @mleew17: boy dats cold...tyga dwn ba... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " class tweet\n",
+ "0 2 !!! RT @mayasolovely: As a woman you shouldn't...\n",
+ "1 1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba...\n",
+ "2 1 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...\n",
+ "3 1 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...\n",
+ "4 1 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you..."
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_ds.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_ds.insert(2, LABEL_COL_NAME, np.ones(len(validation_ds)), True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " class | \n",
+ " tweet | \n",
+ " Label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2 | \n",
+ " !!! RT @mayasolovely: As a woman you shouldn't... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " !!!!! RT @mleew17: boy dats cold...tyga dwn ba... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " class tweet Label\n",
+ "0 2 !!! RT @mayasolovely: As a woman you shouldn't... 1.0\n",
+ "1 1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... 1.0\n",
+ "2 1 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... 1.0\n",
+ "3 1 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... 1.0\n",
+ "4 1 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... 1.0"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_ds.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Use .loc so the assignment writes through to the DataFrame itself;\n",
+    "# assigning into .values is a chained-assignment pattern that can silently\n",
+    "# fail to update the frame (e.g. under pandas copy-on-write).\n",
+    "validation_ds.loc[validation_ds['class'] == 2, LABEL_COL_NAME] = 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " class | \n",
+ " tweet | \n",
+ " Label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2 | \n",
+ " !!! RT @mayasolovely: As a woman you shouldn't... | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " !!!!! RT @mleew17: boy dats cold...tyga dwn ba... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " class tweet Label\n",
+ "0 2 !!! RT @mayasolovely: As a woman you shouldn't... 0.0\n",
+ "1 1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... 1.0\n",
+ "2 1 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... 1.0\n",
+ "3 1 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... 1.0\n",
+ "4 1 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... 1.0"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_ds.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_ds = validation_ds.rename(columns={\"tweet\": TEXT_COL_NAME})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_ds = validation_ds.loc[:, validation_ds.columns.intersection([TEXT_COL_NAME,LABEL_COL_NAME])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Text | \n",
+ " Label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " !!! RT @mayasolovely: As a woman you shouldn't... | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " !!!!! RT @mleew17: boy dats cold...tyga dwn ba... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Text Label\n",
+ "0 !!! RT @mayasolovely: As a woman you shouldn't... 0.0\n",
+ "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... 1.0\n",
+ "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... 1.0\n",
+ "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... 1.0\n",
+ "4 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... 1.0"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_ds.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Ahora necesitaremos limpiar nuestro texto de caracteres indeseados; para eso usaremos una función implementada acá: https://stackoverflow.com/questions/54396405/how-can-i-preprocess-nlp-text-lowercase-remove-special-characters-remove-numb\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Compile the patterns once at module level instead of on every call —\n",
+    "# preprocess() is mapped over every row of the dataset.\n",
+    "HTML_TAG_RE = re.compile('<.*?>')\n",
+    "URL_RE = re.compile(r'http\\S+')\n",
+    "NUM_RE = re.compile('[0-9]+')\n",
+    "WORD_TOKENIZER = RegexpTokenizer(r'\\w+')\n",
+    "\n",
+    "\n",
+    "def preprocess(sentence):\n",
+    "    \"\"\"Normalize a tweet: lowercase, drop HTML tags/URLs/digits, keep word tokens.\"\"\"\n",
+    "    sentence = str(sentence).lower().replace('{html}', '')\n",
+    "    cleantext = HTML_TAG_RE.sub('', sentence)\n",
+    "    rem_url = URL_RE.sub('', cleantext)\n",
+    "    rem_num = NUM_RE.sub('', rem_url)\n",
+    "    tokens = WORD_TOKENIZER.tokenize(rem_num)\n",
+    "    return \" \".join(tokens)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Text | \n",
+ " Label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " rt mayasolovely as a woman you shouldn t compl... | \n",
+ " HEALTHY | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " rt mleew boy dats cold tyga dwn bad for cuffin... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " rt urkindofbrand dawg rt sbabylife you ever fu... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " rt c_g_anderson viva_based she look like a tranny | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " rt shenikaroberts the shit you hear about me m... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Text Label\n",
+ "0 rt mayasolovely as a woman you shouldn t compl... HEALTHY\n",
+ "1 rt mleew boy dats cold tyga dwn bad for cuffin... TOXIC\n",
+ "2 rt urkindofbrand dawg rt sbabylife you ever fu... TOXIC\n",
+ "3 rt c_g_anderson viva_based she look like a tranny TOXIC\n",
+ "4 rt shenikaroberts the shit you hear about me m... TOXIC"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "validation_ds[TEXT_COL_NAME] = validation_ds[TEXT_COL_NAME].map(preprocess)\n",
+ "validation_ds = setLabels(validation_ds)\n",
+ "validation_ds.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total of positives: 20620 Total of negatives: 4163\n",
+ "Total of positives exported: 20000 Total of negatives exported: 4000\n"
+ ]
+ }
+ ],
+ "source": [
+ "exportCSV(validation_ds,'validation_small',(20000,4000))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Representaciones a comparar:\n",
+    "\n",
+    "- BOW\n",
+    "- TFIDF\n",
+    "- FastText\n",
+    "- DistilBERT\n",
+    "- BERT\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tests.ipynb b/tests.ipynb
new file mode 100644
index 0000000..06caa95
--- /dev/null
+++ b/tests.ipynb
@@ -0,0 +1,385 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Test with github"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This notebook will take the \"validation\" file and will test it against our AI lab model, in order to know how effective our model is."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tqdm import tqdm\n",
+ "\n",
+ "import requests\n",
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Text | \n",
+ " Label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ima be home tomorrow so line up a bitch for me | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " yall hoes still play around with the wrong shi... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " binko denzy tryna hoe broski izzy | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " rt causewereguys flirting with bitches on a tw... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " watching the nigger movie menacesociety trying... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Text Label\n",
+ "0 ima be home tomorrow so line up a bitch for me TOXIC\n",
+ "1 yall hoes still play around with the wrong shi... TOXIC\n",
+ "2 binko denzy tryna hoe broski izzy TOXIC\n",
+ "3 rt causewereguys flirting with bitches on a tw... TOXIC\n",
+ "4 watching the nigger movie menacesociety trying... TOXIC"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation = pd.read_csv(\"./validation.csv\")\n",
+ "\n",
+ "\n",
+ "MODELS_V1 = {\n",
+ " \"ELMO - WOLI-120k-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V1\" : {\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/c9830d4c-9c75-11eb-98a5-de79e166b688\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " },\n",
+ " \"ELMO - WOLI-24k-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V1\" : {\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/105077b0-9c93-11eb-ba53-de79e166b688\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " },\n",
+ " \"ELMO - WOLI-24k-T0.3-VAL - Logistic Regression using TF-IDF vectors V1\" : {\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/1071863e-9c94-11eb-ba92-de79e166b688\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " },\n",
+ " \"ELMO - WOLI-240K-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V2 no stop words\" : {\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/6f575112-9ca6-11eb-aefd-de79e166b688\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " },\n",
+ " \"WOLI - Bert -120k FFNN using BERT vectors\": {\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/762877ca-9a79-11eb-8d07-de79e166b688\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " },\n",
+ " \"WOLI - DistilBert -120k FFNN using DistilBERT vectors\": {\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/35ae0026-9a0f-11eb-9a75-de79e166b688\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " },\n",
+ " \"ELMO - WOLI - Bert - 300/300 FFNN using BERT vectors no stop words\": {\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/30d948b8-9d4c-11eb-aa0f-521c6757c414\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " }, \n",
+ " \"ELMO - WOLI - Bert - 30k/30k FFNN using DistilBERT vectors no stop words\": {\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/03279852-9d6a-11eb-a90b-8697a6fa86bd\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " },\n",
+ " \"ELMO - WOLI - Bert - 30k/30k FFNN using BERT vectors no stop words\":{\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/49fb832a-9dfa-11eb-b2b9-8697a6fa86bd\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " },\n",
+ " \"ELMO - WOLI Distilbert T3,0 100k/100k no stop words\":{\n",
+ " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/721db1da-9eee-11eb-8df4-8697a6fa86bd\",\n",
+ " \"accuracy\": None,\n",
+ " \"predicted_values\" : [],\n",
+ " }\n",
+ " \n",
+ "}\n",
+ "\n",
+    "# Subset of MODELS_V1 actually evaluated in this run.\n",
+    "SELECTED_MODEL_NAMES = [\n",
+    "    \"ELMO - WOLI - Bert - 300/300 FFNN using BERT vectors no stop words\",\n",
+    "    \"ELMO - WOLI-240K-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V2 no stop words\",\n",
+    "    \"ELMO - WOLI - Bert - 30k/30k FFNN using DistilBERT vectors no stop words\",\n",
+    "    \"ELMO - WOLI - Bert - 30k/30k FFNN using BERT vectors no stop words\",\n",
+    "    \"ELMO - WOLI Distilbert T3,0 100k/100k no stop words\",\n",
+    "]\n",
+    "MODELS = {name: MODELS_V1[name] for name in SELECTED_MODEL_NAMES}\n",
+ "\n",
+ "validation.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Text | \n",
+ " Label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " jonjanke deadspin jtrimbl you sound like you r... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " the trash wheel has a twitter and googly eyes | \n",
+ " HEALTHY | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " the_red_sea shethetruththo say that shit to my... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " good they trash rt jerseyzbest niggas ate most... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " neva been a bitch nigga you don t want them pr... | \n",
+ " TOXIC | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Text Label\n",
+ "0 jonjanke deadspin jtrimbl you sound like you r... TOXIC\n",
+ "1 the trash wheel has a twitter and googly eyes HEALTHY\n",
+ "2 the_red_sea shethetruththo say that shit to my... TOXIC\n",
+ "3 good they trash rt jerseyzbest niggas ate most... TOXIC\n",
+ "4 neva been a bitch nigga you don t want them pr... TOXIC"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "allLen = len(validation)\n",
+    "\n",
+    "SUB_SET_SIZE = 100\n",
+    "SEED = 42  # fixed seed so the sampled subset (and the accuracies below) are reproducible\n",
+    "\n",
+    "# Draw SUB_SET_SIZE random rows without replacement, using a deterministic RNG.\n",
+    "rng = np.random.default_rng(SEED)\n",
+    "rndIdx = rng.permutation(allLen)[:SUB_SET_SIZE]\n",
+    "validation_sub_set = validation.iloc[rndIdx].reset_index(drop=True)\n",
+    "validation_sub_set.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "def getAccuracy(predicted_values):\n",
+    "    \"\"\"Return the accuracy (in %) of predicted_values against validation_sub_set.\n",
+    "\n",
+    "    Counts every label/prediction mismatch as an error. The previous chained\n",
+    "    comparison `label != predicted == 'HEALTHY'` only counted errors where the\n",
+    "    prediction was HEALTHY (false negatives), which inflated the accuracy.\n",
+    "    \"\"\"\n",
+    "    errors = 0\n",
+    "    for index, row in validation_sub_set.iterrows():\n",
+    "        label = row['Label']\n",
+    "        predicted = predicted_values[index]\n",
+    "        if label != predicted:\n",
+    "            errors += 1\n",
+    "\n",
+    "    return 100 - ((errors / len(validation_sub_set)) * 100)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 500/500 [04:01<00:00, 2.34it/s]"
+ ]
+ }
+ ],
+ "source": [
+    "items = MODELS.items()\n",
+    "allItemsLen = len(items) * len(validation_sub_set)\n",
+    "headers = {'accept': 'application/json'}\n",
+    "# response example {'processing_time': 0.024, 'confidence_score': 0.983, 'result': '1.0'}\n",
+    "with tqdm(total=allItemsLen) as pbar:  # context manager ensures the bar is closed\n",
+    "    for name, modelData in items:\n",
+    "        predicted_values = []\n",
+    "        for index, row in validation_sub_set.iterrows():\n",
+    "            url = modelData[\"url\"]\n",
+    "            # Let requests serialize the payload (it also sets Content-Type);\n",
+    "            # building the JSON body by string concatenation produced invalid\n",
+    "            # JSON whenever a tweet contained quotes or backslashes.\n",
+    "            response = requests.post(url, headers=headers, json={'text': row['Text']}).json()\n",
+    "            predicted_value = response['result']\n",
+    "\n",
+    "            if predicted_value == \"0.0\":\n",
+    "                predicted_value = 'HEALTHY'\n",
+    "            elif predicted_value == \"1.0\":\n",
+    "                predicted_value = 'TOXIC'\n",
+    "\n",
+    "            predicted_values.append(predicted_value)\n",
+    "            pbar.update(n=1)\n",
+    "        MODELS[name]['accuracy'] = getAccuracy(predicted_values)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we will verify if our model is effective with our validation data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'ELMO - WOLI - Bert - 300/300 FFNN using BERT vectors no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/30d948b8-9d4c-11eb-aa0f-521c6757c414',\n",
+ " 'accuracy': 73.0,\n",
+ " 'predicted_values': []},\n",
+ " 'ELMO - WOLI-240K-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V2 no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/6f575112-9ca6-11eb-aefd-de79e166b688',\n",
+ " 'accuracy': 73.0,\n",
+ " 'predicted_values': []},\n",
+ " 'ELMO - WOLI - Bert - 30k/30k FFNN using DistilBERT vectors no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/03279852-9d6a-11eb-a90b-8697a6fa86bd',\n",
+ " 'accuracy': 85.0,\n",
+ " 'predicted_values': []},\n",
+ " 'ELMO - WOLI - Bert - 30k/30k FFNN using BERT vectors no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/49fb832a-9dfa-11eb-b2b9-8697a6fa86bd',\n",
+ " 'accuracy': 88.0,\n",
+ " 'predicted_values': []},\n",
+ " 'ELMO - WOLI Distilbert T3,0 100k/100k no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/721db1da-9eee-11eb-8df4-8697a6fa86bd',\n",
+ " 'accuracy': 84.0,\n",
+ " 'predicted_values': []}}"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "MODELS"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}