# Take a look on this
# https://medium.com/@sebastiannorena/finding-correlation-between-many-variables-multidimensional-dataset-with-python-5deb3f39ffb3

import pandas as pd
import numpy as np
import nltk
import re
import os

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from zipfile import ZipFile


lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

nltk.download('stopwords')
nltk.download('wordnet')


# Column names used by the AI-Lab CSV format (text + class label only).
TEXT_COL_NAME = 'Text'
LABEL_COL_NAME = 'Label'

TOXIC_LABEL = 'TOXIC'
HEALTHY_LABEL = 'HEALTHY'

# Toxicity score at or above which a comment is labeled TOXIC.
THRESHOLD = 0.3

# Default (positives, negatives) row counts to export.
DEFAULT_DISTRIBUTION = (50000, 50000)


def exportCSV(df, name, distribution=DEFAULT_DISTRIBUTION):
    """Export a balanced, shuffled sample of `df` to `<name>.csv` and `<name>.zip`.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain LABEL_COL_NAME with TOXIC_LABEL / HEALTHY_LABEL values.
    name : str
        Base file name (without extension) for the CSV and ZIP outputs,
        written to the current working directory.
    distribution : tuple of (int, int)
        Maximum number of (positive, negative) rows to export; capped at
        the number of rows actually available per class.
    """
    dis_positive, dis_negative = distribution

    ds_positive = df[df[LABEL_COL_NAME] == TOXIC_LABEL]
    ds_negative = df[df[LABEL_COL_NAME] == HEALTHY_LABEL]

    # Cap the requested counts at what is actually available.
    max_positives = min(dis_positive, len(ds_positive))
    max_negatives = min(dis_negative, len(ds_negative))

    # Random sample without replacement from each class (replaces the manual
    # np.arange + np.random.shuffle + iloc dance).
    out_df_positives = ds_positive.sample(n=max_positives)
    out_df_negatives = ds_negative.sample(n=max_negatives)

    print("Total of positives: ", len(ds_positive), "Total of negatives: ", len(ds_negative))
    print("Total of positives exported: ", max_positives, "Total of negatives exported: ", max_negatives)

    # Concatenate and reshuffle so the classes are interleaved in the file.
    out_df = pd.concat([out_df_positives, out_df_negatives]).sample(frac=1)

    csv_name = name + '.csv'
    zip_name = name + '.zip'

    # Remove stale outputs from a previous run. A missing file is the normal
    # first-run case, not an error; only report real deletion failures
    # (the old bare `except:` printed a misleading error for absent files).
    for stale in (csv_name, zip_name):
        try:
            os.remove(stale)
        except FileNotFoundError:
            pass
        except OSError:
            print("Error while deleting file ", stale)

    out_df.to_csv(csv_name, index=False)

    # Context manager guarantees the archive is closed even if write() raises.
    with ZipFile(zip_name, 'w') as zip_obj:
        zip_obj.write(csv_name)


def setLabels(df):
    """Replace numeric labels (1 / 0) with TOXIC_LABEL / HEALTHY_LABEL.

    Mutates `df` in place and also returns it for chaining.
    """
    positives = df[LABEL_COL_NAME] == 1
    negatives = df[LABEL_COL_NAME] == 0

    df.loc[positives, LABEL_COL_NAME] = TOXIC_LABEL
    df.loc[negatives, LABEL_COL_NAME] = HEALTHY_LABEL

    return df
pd.read_csv(\"./validation_dataset.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtargetcomment_textsevere_toxicityobsceneidentity_attackinsultthreatasianatheist...article_idratingfunnywowsadlikesdisagreesexual_explicitidentity_annotator_counttoxicity_annotator_count
0598480.000000This is so cool. It's like, 'would you want yo...0.0000000.00.0000000.000000.0NaNNaN...2006rejected000000.004
1598490.000000Thank you!! This would make my life a lot less...0.0000000.00.0000000.000000.0NaNNaN...2006rejected000000.004
2598520.000000This is such an urgent design problem; kudos t...0.0000000.00.0000000.000000.0NaNNaN...2006rejected000000.004
3598550.000000Is this something I'll be able to install on m...0.0000000.00.0000000.000000.0NaNNaN...2006rejected000000.004
4598560.893617haha you guys are a bunch of losers.0.0212770.00.0212770.872340.00.00.0...2006rejected000100.0447
\n", + "

5 rows × 45 columns

\n", + "
" + ], + "text/plain": [ + " id target comment_text \\\n", + "0 59848 0.000000 This is so cool. It's like, 'would you want yo... \n", + "1 59849 0.000000 Thank you!! This would make my life a lot less... \n", + "2 59852 0.000000 This is such an urgent design problem; kudos t... \n", + "3 59855 0.000000 Is this something I'll be able to install on m... \n", + "4 59856 0.893617 haha you guys are a bunch of losers. \n", + "\n", + " severe_toxicity obscene identity_attack insult threat asian atheist \\\n", + "0 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n", + "1 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n", + "2 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n", + "3 0.000000 0.0 0.000000 0.00000 0.0 NaN NaN \n", + "4 0.021277 0.0 0.021277 0.87234 0.0 0.0 0.0 \n", + "\n", + " ... article_id rating funny wow sad likes disagree \\\n", + "0 ... 2006 rejected 0 0 0 0 0 \n", + "1 ... 2006 rejected 0 0 0 0 0 \n", + "2 ... 2006 rejected 0 0 0 0 0 \n", + "3 ... 2006 rejected 0 0 0 0 0 \n", + "4 ... 2006 rejected 0 0 0 1 0 \n", + "\n", + " sexual_explicit identity_annotator_count toxicity_annotator_count \n", + "0 0.0 0 4 \n", + "1 0.0 0 4 \n", + "2 0.0 0 4 \n", + "3 0.0 0 4 \n", + "4 0.0 4 47 \n", + "\n", + "[5 rows x 45 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
idtargetsevere_toxicityobsceneidentity_attackinsultthreatasianatheistbisexual...parent_idarticle_idfunnywowsadlikesdisagreesexual_explicitidentity_annotator_counttoxicity_annotator_count
count1.804874e+061.804874e+061.804874e+061.804874e+061.804874e+061.804874e+061.804874e+06405130.000000405130.000000405130.000000...1.026228e+061.804874e+061.804874e+061.804874e+061.804874e+061.804874e+061.804874e+061.804874e+061.804874e+061.804874e+06
mean3.738434e+061.030173e-014.582099e-031.387721e-022.263571e-028.115273e-029.311271e-030.0119640.0032050.001884...3.722687e+062.813597e+052.779269e-014.420696e-021.091173e-012.446167e+005.843688e-016.605974e-031.439019e+008.784694e+00
std2.445187e+061.970757e-012.286128e-026.460419e-027.873156e-021.760657e-014.942218e-020.0871660.0501930.026077...2.450261e+061.039293e+051.055313e+002.449359e-014.555363e-014.727924e+001.866589e+004.529782e-021.787041e+014.350086e+01
min5.984800e+040.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.0000000.0000000.000000...6.100600e+042.006000e+030.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+003.000000e+00
25%7.969752e+050.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.0000000.0000000.000000...7.960188e+051.601200e+050.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+004.000000e+00
50%5.223774e+060.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+000.0000000.0000000.000000...5.222993e+063.321260e+050.000000e+000.000000e+000.000000e+001.000000e+000.000000e+000.000000e+000.000000e+004.000000e+00
75%5.769854e+061.666667e-010.000000e+000.000000e+000.000000e+009.090909e-020.000000e+000.0000000.0000000.000000...5.775758e+063.662370e+050.000000e+000.000000e+000.000000e+003.000000e+000.000000e+000.000000e+000.000000e+006.000000e+00
max6.334010e+061.000000e+001.000000e+001.000000e+001.000000e+001.000000e+001.000000e+001.0000001.0000001.000000...6.333965e+063.995410e+051.020000e+022.100000e+013.100000e+013.000000e+021.870000e+021.000000e+001.866000e+034.936000e+03
\n", + "

8 rows × 42 columns

\n", + "
" + ], + "text/plain": [ + " id target severe_toxicity obscene \\\n", + "count 1.804874e+06 1.804874e+06 1.804874e+06 1.804874e+06 \n", + "mean 3.738434e+06 1.030173e-01 4.582099e-03 1.387721e-02 \n", + "std 2.445187e+06 1.970757e-01 2.286128e-02 6.460419e-02 \n", + "min 5.984800e+04 0.000000e+00 0.000000e+00 0.000000e+00 \n", + "25% 7.969752e+05 0.000000e+00 0.000000e+00 0.000000e+00 \n", + "50% 5.223774e+06 0.000000e+00 0.000000e+00 0.000000e+00 \n", + "75% 5.769854e+06 1.666667e-01 0.000000e+00 0.000000e+00 \n", + "max 6.334010e+06 1.000000e+00 1.000000e+00 1.000000e+00 \n", + "\n", + " identity_attack insult threat asian \\\n", + "count 1.804874e+06 1.804874e+06 1.804874e+06 405130.000000 \n", + "mean 2.263571e-02 8.115273e-02 9.311271e-03 0.011964 \n", + "std 7.873156e-02 1.760657e-01 4.942218e-02 0.087166 \n", + "min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 \n", + "25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 \n", + "50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 \n", + "75% 0.000000e+00 9.090909e-02 0.000000e+00 0.000000 \n", + "max 1.000000e+00 1.000000e+00 1.000000e+00 1.000000 \n", + "\n", + " atheist bisexual ... parent_id article_id \\\n", + "count 405130.000000 405130.000000 ... 1.026228e+06 1.804874e+06 \n", + "mean 0.003205 0.001884 ... 3.722687e+06 2.813597e+05 \n", + "std 0.050193 0.026077 ... 2.450261e+06 1.039293e+05 \n", + "min 0.000000 0.000000 ... 6.100600e+04 2.006000e+03 \n", + "25% 0.000000 0.000000 ... 7.960188e+05 1.601200e+05 \n", + "50% 0.000000 0.000000 ... 5.222993e+06 3.321260e+05 \n", + "75% 0.000000 0.000000 ... 5.775758e+06 3.662370e+05 \n", + "max 1.000000 1.000000 ... 
6.333965e+06 3.995410e+05 \n", + "\n", + " funny wow sad likes disagree \\\n", + "count 1.804874e+06 1.804874e+06 1.804874e+06 1.804874e+06 1.804874e+06 \n", + "mean 2.779269e-01 4.420696e-02 1.091173e-01 2.446167e+00 5.843688e-01 \n", + "std 1.055313e+00 2.449359e-01 4.555363e-01 4.727924e+00 1.866589e+00 \n", + "min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", + "25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 \n", + "50% 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00 \n", + "75% 0.000000e+00 0.000000e+00 0.000000e+00 3.000000e+00 0.000000e+00 \n", + "max 1.020000e+02 2.100000e+01 3.100000e+01 3.000000e+02 1.870000e+02 \n", + "\n", + " sexual_explicit identity_annotator_count toxicity_annotator_count \n", + "count 1.804874e+06 1.804874e+06 1.804874e+06 \n", + "mean 6.605974e-03 1.439019e+00 8.784694e+00 \n", + "std 4.529782e-02 1.787041e+01 4.350086e+01 \n", + "min 0.000000e+00 0.000000e+00 3.000000e+00 \n", + "25% 0.000000e+00 0.000000e+00 4.000000e+00 \n", + "50% 0.000000e+00 0.000000e+00 4.000000e+00 \n", + "75% 0.000000e+00 0.000000e+00 6.000000e+00 \n", + "max 1.000000e+00 1.866000e+03 4.936000e+03 \n", + "\n", + "[8 rows x 42 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Como ai lab solo acepta dos columnas, texto y class entonces eliminaremos todas las columnas excepto comment_text y target" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "dataset2cols = dataset.loc[:, dataset.columns.intersection(['target','comment_text'])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cambiamos el orden de las columnas para que target esté después de comment text" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + 
"outputs": [], + "source": [ + "dataset2cols = dataset2cols.reindex(columns=['comment_text','target'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
comment_texttarget
0This is so cool. It's like, 'would you want yo...0.000000
1Thank you!! This would make my life a lot less...0.000000
2This is such an urgent design problem; kudos t...0.000000
3Is this something I'll be able to install on m...0.000000
4haha you guys are a bunch of losers.0.893617
\n", + "
" + ], + "text/plain": [ + " comment_text target\n", + "0 This is so cool. It's like, 'would you want yo... 0.000000\n", + "1 Thank you!! This would make my life a lot less... 0.000000\n", + "2 This is such an urgent design problem; kudos t... 0.000000\n", + "3 Is this something I'll be able to install on m... 0.000000\n", + "4 haha you guys are a bunch of losers. 0.893617" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset2cols.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ahora como ai lab solo acepta valores discretos tendremos que especificar un treshold para definir nuestras dos clases: no ofensivo (0) / ofensivo (1)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
comment_texttarget
4haha you guys are a bunch of losers.0.893617
5ur a sh*tty comment.0.666667
6hahahahahahahahhha suck it.0.457627
11This is a great story. Man. I wonder if the pe...0.440000
13It's ridiculous that these guys are being call...0.600000
.........
1804852Xi and his comrades must be smirking over Trum...0.300000
1804856Believing in God or not believing in God are p...0.500000
1804857I take your point, but I think you're shooting...0.700000
1804866There's no whine like Alberta whine!0.300000
1804872Anyone who is quoted as having the following e...0.621212
\n", + "

266089 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " comment_text target\n", + "4 haha you guys are a bunch of losers. 0.893617\n", + "5 ur a sh*tty comment. 0.666667\n", + "6 hahahahahahahahhha suck it. 0.457627\n", + "11 This is a great story. Man. I wonder if the pe... 0.440000\n", + "13 It's ridiculous that these guys are being call... 0.600000\n", + "... ... ...\n", + "1804852 Xi and his comrades must be smirking over Trum... 0.300000\n", + "1804856 Believing in God or not believing in God are p... 0.500000\n", + "1804857 I take your point, but I think you're shooting... 0.700000\n", + "1804866 There's no whine like Alberta whine! 0.300000\n", + "1804872 Anyone who is quoted as having the following e... 0.621212\n", + "\n", + "[266089 rows x 2 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset2cols[dataset2cols.target >= THRESHOLD]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "dataset2cols.insert(2, LABEL_COL_NAME, np.zeros(len(dataset2cols)), True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
comment_texttargetLabel
0This is so cool. It's like, 'would you want yo...0.0000000.0
1Thank you!! This would make my life a lot less...0.0000000.0
2This is such an urgent design problem; kudos t...0.0000000.0
3Is this something I'll be able to install on m...0.0000000.0
4haha you guys are a bunch of losers.0.8936170.0
............
1804869Maybe the tax on \"things\" would be collected w...0.0000000.0
1804870What do you call people who STILL think the di...0.0000000.0
1804871thank you ,,,right or wrong,,, i am following ...0.0000000.0
1804872Anyone who is quoted as having the following e...0.6212120.0
1804873Students defined as EBD are legally just as di...0.0000000.0
\n", + "

1804874 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " comment_text target Label\n", + "0 This is so cool. It's like, 'would you want yo... 0.000000 0.0\n", + "1 Thank you!! This would make my life a lot less... 0.000000 0.0\n", + "2 This is such an urgent design problem; kudos t... 0.000000 0.0\n", + "3 Is this something I'll be able to install on m... 0.000000 0.0\n", + "4 haha you guys are a bunch of losers. 0.893617 0.0\n", + "... ... ... ...\n", + "1804869 Maybe the tax on \"things\" would be collected w... 0.000000 0.0\n", + "1804870 What do you call people who STILL think the di... 0.000000 0.0\n", + "1804871 thank you ,,,right or wrong,,, i am following ... 0.000000 0.0\n", + "1804872 Anyone who is quoted as having the following e... 0.621212 0.0\n", + "1804873 Students defined as EBD are legally just as di... 0.000000 0.0\n", + "\n", + "[1804874 rows x 3 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset2cols" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "dataset2cols[LABEL_COL_NAME].values[dataset2cols.target >= THRESHOLD] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetLabel
count266089.000000266089.0
mean0.5174491.0
std0.1868040.0
min0.3000001.0
25%0.4000001.0
50%0.5000001.0
75%0.6447371.0
max1.0000001.0
\n", + "
" + ], + "text/plain": [ + " target Label\n", + "count 266089.000000 266089.0\n", + "mean 0.517449 1.0\n", + "std 0.186804 0.0\n", + "min 0.300000 1.0\n", + "25% 0.400000 1.0\n", + "50% 0.500000 1.0\n", + "75% 0.644737 1.0\n", + "max 1.000000 1.0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset2cols[dataset2cols.target >= THRESHOLD].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "finalDataset = dataset2cols.reindex(columns=['comment_text',LABEL_COL_NAME])\n", + "finalDataset = finalDataset.rename(columns={\"comment_text\": TEXT_COL_NAME})\n", + "finalDataset = setLabels(finalDataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TextLabel
0This is so cool. It's like, 'would you want yo...HEALTHY
1Thank you!! This would make my life a lot less...HEALTHY
2This is such an urgent design problem; kudos t...HEALTHY
3Is this something I'll be able to install on m...HEALTHY
4haha you guys are a bunch of losers.TOXIC
\n", + "
" + ], + "text/plain": [ + " Text Label\n", + "0 This is so cool. It's like, 'would you want yo... HEALTHY\n", + "1 Thank you!! This would make my life a lot less... HEALTHY\n", + "2 This is such an urgent design problem; kudos t... HEALTHY\n", + "3 Is this something I'll be able to install on m... HEALTHY\n", + "4 haha you guys are a bunch of losers. TOXIC" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "finalDataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total of positives: 266089 Total of negatives: 1538785\n", + "Total of positives exported: 300 Total of negatives exported: 300\n", + "Total of positives: 266089 Total of negatives: 1538785\n", + "Total of positives exported: 3000 Total of negatives exported: 3000\n", + "Total of positives: 266089 Total of negatives: 1538785\n", + "Total of positives exported: 30000 Total of negatives exported: 30000\n", + "Total of positives: 266089 Total of negatives: 1538785\n", + "Total of positives exported: 50000 Total of negatives exported: 50000\n", + "Total of positives: 266089 Total of negatives: 1538785\n", + "Total of positives exported: 100000 Total of negatives exported: 100000\n", + "Error while deleting file train_small_100k.csv\n", + "Error while deleting file train_small_100k.zip\n" + ] + } + ], + "source": [ + "exportCSV(finalDataset,'train_small_300',(300,300))\n", + "exportCSV(finalDataset,'train_small_3k',(3000,3000))\n", + "exportCSV(finalDataset,'train_small_30k',(30000,30000))\n", + "exportCSV(finalDataset,'train_small_50k',(50000,50000))\n", + "exportCSV(finalDataset,'train_small_100k',(100000,100000))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Despues de tener nuestro dataset listo y exportado vamos ahora a trabajar nuestro dataset de validacion que se encuentra acá 
https://github.com/t-davidson/hate-speech-and-offensive-language/tree/master/data\n", + "\n", + "Si en el dataset de validacion la clase es 2 entonces es no ofensivo\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0counthate_speechoffensive_languageneitherclasstweet
0030032!!! RT @mayasolovely: As a woman you shouldn't...
1130301!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2230301!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3330211!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4460601!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 count hate_speech offensive_language neither class \\\n", + "0 0 3 0 0 3 2 \n", + "1 1 3 0 3 0 1 \n", + "2 2 3 0 3 0 1 \n", + "3 3 3 0 2 1 1 \n", + "4 4 6 0 6 0 1 \n", + "\n", + " tweet \n", + "0 !!! RT @mayasolovely: As a woman you shouldn't... \n", + "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... \n", + "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... \n", + "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... \n", + "4 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_ds.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "validation_ds = validation_ds.loc[:, validation_ds.columns.intersection(['tweet','class'])]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classtweet
02!!! RT @mayasolovely: As a woman you shouldn't...
11!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
21!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
31!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
41!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
\n", + "
" + ], + "text/plain": [ + " class tweet\n", + "0 2 !!! RT @mayasolovely: As a woman you shouldn't...\n", + "1 1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba...\n", + "2 1 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...\n", + "3 1 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...\n", + "4 1 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you..." + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_ds.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "validation_ds.insert(2, LABEL_COL_NAME, np.ones(len(validation_ds)), True)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classtweetLabel
02!!! RT @mayasolovely: As a woman you shouldn't...1.0
11!!!!! RT @mleew17: boy dats cold...tyga dwn ba...1.0
21!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...1.0
31!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...1.0
41!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...1.0
\n", + "
" + ], + "text/plain": [ + " class tweet Label\n", + "0 2 !!! RT @mayasolovely: As a woman you shouldn't... 1.0\n", + "1 1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... 1.0\n", + "2 1 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... 1.0\n", + "3 1 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... 1.0\n", + "4 1 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... 1.0" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_ds.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "validation_ds[LABEL_COL_NAME].values[validation_ds['class'] == 2] = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classtweetLabel
02!!! RT @mayasolovely: As a woman you shouldn't...0.0
11!!!!! RT @mleew17: boy dats cold...tyga dwn ba...1.0
21!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...1.0
31!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...1.0
41!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...1.0
\n", + "
" + ], + "text/plain": [ + " class tweet Label\n", + "0 2 !!! RT @mayasolovely: As a woman you shouldn't... 0.0\n", + "1 1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... 1.0\n", + "2 1 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... 1.0\n", + "3 1 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... 1.0\n", + "4 1 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... 1.0" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_ds.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "validation_ds = validation_ds.rename(columns={\"tweet\": TEXT_COL_NAME})" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "validation_ds = validation_ds.loc[:, validation_ds.columns.intersection([TEXT_COL_NAME,LABEL_COL_NAME])]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TextLabel
0!!! RT @mayasolovely: As a woman you shouldn't...0.0
1!!!!! RT @mleew17: boy dats cold...tyga dwn ba...1.0
2!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...1.0
3!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...1.0
4!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...1.0
\n", + "
" + ], + "text/plain": [ + " Text Label\n", + "0 !!! RT @mayasolovely: As a woman you shouldn't... 0.0\n", + "1 !!!!! RT @mleew17: boy dats cold...tyga dwn ba... 1.0\n", + "2 !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby... 1.0\n", + "3 !!!!!!!!! RT @C_G_Anderson: @viva_based she lo... 1.0\n", + "4 !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you... 1.0" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_ds.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ahora necesitaremos limpiar nuestro texto de caracteres indeseados para eso usaremos una funcion implementada acá https://stackoverflow.com/questions/54396405/how-can-i-preprocess-nlp-text-lowercase-remove-special-characters-remove-numb\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(sentence):\n", + " sentence=str(sentence)\n", + " sentence = sentence.lower()\n", + " sentence=sentence.replace('{html}',\"\") \n", + " cleanr = re.compile('<.*?>')\n", + " cleantext = re.sub(cleanr, '', sentence)\n", + " rem_url=re.sub(r'http\\S+', '',cleantext)\n", + " rem_num = re.sub('[0-9]+', '', rem_url)\n", + " tokenizer = RegexpTokenizer(r'\\w+')\n", + " tokens = tokenizer.tokenize(rem_num)\n", + " return \" \".join(tokens)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TextLabel
0rt mayasolovely as a woman you shouldn t compl...HEALTHY
1rt mleew boy dats cold tyga dwn bad for cuffin...TOXIC
2rt urkindofbrand dawg rt sbabylife you ever fu...TOXIC
3rt c_g_anderson viva_based she look like a trannyTOXIC
4rt shenikaroberts the shit you hear about me m...TOXIC
\n", + "
" + ], + "text/plain": [ + " Text Label\n", + "0 rt mayasolovely as a woman you shouldn t compl... HEALTHY\n", + "1 rt mleew boy dats cold tyga dwn bad for cuffin... TOXIC\n", + "2 rt urkindofbrand dawg rt sbabylife you ever fu... TOXIC\n", + "3 rt c_g_anderson viva_based she look like a tranny TOXIC\n", + "4 rt shenikaroberts the shit you hear about me m... TOXIC" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_ds[TEXT_COL_NAME] = validation_ds[TEXT_COL_NAME].map(lambda s:preprocess(s)) \n", + "validation_ds = setLabels(validation_ds)\n", + "validation_ds.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total of positives: 20620 Total of negatives: 4163\n", + "Total of positives exported: 20000 Total of negatives exported: 4000\n" + ] + } + ], + "source": [ + "exportCSV(validation_ds,'validation_small',(20000,4000))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BOW\n", + "TFIDF\n", + "FastText\n", + "DistilBERT\n", + "BERT\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests.ipynb b/tests.ipynb new file mode 100644 index 0000000..06caa95 --- /dev/null +++ b/tests.ipynb @@ -0,0 +1,385 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test with github" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "This notebook will take the \"validation\" file and will test it against our AI lab model, in order to know how effective our model is." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "\n", + "import requests\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TextLabel
0ima be home tomorrow so line up a bitch for meTOXIC
1yall hoes still play around with the wrong shi...TOXIC
2binko denzy tryna hoe broski izzyTOXIC
3rt causewereguys flirting with bitches on a tw...TOXIC
4watching the nigger movie menacesociety trying...TOXIC
\n", + "
" + ], + "text/plain": [ + " Text Label\n", + "0 ima be home tomorrow so line up a bitch for me TOXIC\n", + "1 yall hoes still play around with the wrong shi... TOXIC\n", + "2 binko denzy tryna hoe broski izzy TOXIC\n", + "3 rt causewereguys flirting with bitches on a tw... TOXIC\n", + "4 watching the nigger movie menacesociety trying... TOXIC" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation = pd.read_csv(\"./validation.csv\")\n", + "\n", + "\n", + "MODELS_V1 = {\n", + " \"ELMO - WOLI-120k-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V1\" : {\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/c9830d4c-9c75-11eb-98a5-de79e166b688\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " },\n", + " \"ELMO - WOLI-24k-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V1\" : {\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/105077b0-9c93-11eb-ba53-de79e166b688\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " },\n", + " \"ELMO - WOLI-24k-T0.3-VAL - Logistic Regression using TF-IDF vectors V1\" : {\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/1071863e-9c94-11eb-ba92-de79e166b688\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " },\n", + " \"ELMO - WOLI-240K-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V2 no stop words\" : {\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/6f575112-9ca6-11eb-aefd-de79e166b688\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " },\n", + " \"WOLI - Bert -120k FFNN using BERT vectors\": {\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/762877ca-9a79-11eb-8d07-de79e166b688\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " },\n", + " \"WOLI - DistilBert -120k FFNN using DistilBERT 
vectors\": {\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/35ae0026-9a0f-11eb-9a75-de79e166b688\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " },\n", + " \"ELMO - WOLI - Bert - 300/300 FFNN using BERT vectors no stop words\": {\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/30d948b8-9d4c-11eb-aa0f-521c6757c414\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " }, \n", + " \"ELMO - WOLI - Bert - 30k/30k FFNN using DistilBERT vectors no stop words\": {\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/03279852-9d6a-11eb-a90b-8697a6fa86bd\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " },\n", + " \"ELMO - WOLI - Bert - 30k/30k FFNN using BERT vectors no stop words\":{\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/49fb832a-9dfa-11eb-b2b9-8697a6fa86bd\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " },\n", + " \"ELMO - WOLI Distilbert T3,0 100k/100k no stop words\":{\n", + " \"url\": \"https://predict-ailab.uruit.com/text/classification/predict/721db1da-9eee-11eb-8df4-8697a6fa86bd\",\n", + " \"accuracy\": None,\n", + " \"predicted_values\" : [],\n", + " }\n", + " \n", + "}\n", + "\n", + "MODELS = {\n", + " \"ELMO - WOLI - Bert - 300/300 FFNN using BERT vectors no stop words\": MODELS_V1[\"ELMO - WOLI - Bert - 300/300 FFNN using BERT vectors no stop words\"],\n", + " \"ELMO - WOLI-240K-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V2 no stop words\" : MODELS_V1[\"ELMO - WOLI-240K-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V2 no stop words\"],\n", + " \"ELMO - WOLI - Bert - 30k/30k FFNN using DistilBERT vectors no stop words\": MODELS_V1[\"ELMO - WOLI - Bert - 30k/30k FFNN using DistilBERT vectors no stop words\"],\n", + " \"ELMO - WOLI - Bert - 30k/30k FFNN using BERT vectors no stop words\": MODELS_V1[\"ELMO - WOLI - Bert - 
30k/30k FFNN using BERT vectors no stop words\"],\n", + " \"ELMO - WOLI Distilbert T3,0 100k/100k no stop words\": MODELS_V1[\"ELMO - WOLI Distilbert T3,0 100k/100k no stop words\"]\n", + "}\n", + "\n", + "validation.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TextLabel
0jonjanke deadspin jtrimbl you sound like you r...TOXIC
1the trash wheel has a twitter and googly eyesHEALTHY
2the_red_sea shethetruththo say that shit to my...TOXIC
3good they trash rt jerseyzbest niggas ate most...TOXIC
4neva been a bitch nigga you don t want them pr...TOXIC
\n", + "
" + ], + "text/plain": [ + " Text Label\n", + "0 jonjanke deadspin jtrimbl you sound like you r... TOXIC\n", + "1 the trash wheel has a twitter and googly eyes HEALTHY\n", + "2 the_red_sea shethetruththo say that shit to my... TOXIC\n", + "3 good they trash rt jerseyzbest niggas ate most... TOXIC\n", + "4 neva been a bitch nigga you don t want them pr... TOXIC" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "allLen = len(validation)\n", + "\n", + "SUB_SET_SIZE = 100\n", + "\n", + "indexes = np.arange(allLen);\n", + "np.random.shuffle(indexes)\n", + "rndIdx = indexes[:SUB_SET_SIZE]\n", + "validation_sub_set = validation.iloc[rndIdx]\n", + "validation_sub_set.reset_index(inplace=True, drop=True)\n", + "validation_sub_set.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def getAccuracy(predicted_values):\n", + " errors_array = []\n", + " for index, row in validation_sub_set.iterrows():\n", + " text = row['Text']\n", + " label = row['Label']\n", + " predicted = predicted_values[index]\n", + " if (label != predicted == 'HEALTHY'):\n", + " errors_array.append([text,label,predicted])\n", + "\n", + " errors_df = pd.DataFrame(errors_array,columns=['Text','Expected','Received'])\n", + " errorsLen = len(errors_array)\n", + " subsetLen = len(validation_sub_set)\n", + " accuracy = 100 - ((errorsLen / subsetLen) * 100)\n", + " return accuracy\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 500/500 [04:01<00:00, 2.34it/s]" + ] + } + ], + "source": [ + "items = MODELS.items();\n", + "allItemsLen = len(items) * len(validation_sub_set);\n", + "headers = {'accept':'application/json','Content-Type': 'application/json'}\n", + "pbar = tqdm(total=allItemsLen)\n", + "# response example {'processing_time': 0.024, 
'confidence_score': 0.983, 'result': '1.0'}\n", + "for name, modelData in items:\n", + " predicted_values = []\n", + " for index, row in validation_sub_set.iterrows():\n", + " url = modelData[\"url\"]\n", + " data = '{\"text\":\"'+row['Text']+'\"}'\n", + " response = requests.post(url, headers=headers, data=data)\n", + " response = response.json()\n", + " predicted_value = response['result']\n", + " \n", + " if(predicted_value == \"0.0\"):\n", + " predicted_value = 'HEALTHY'\n", + " elif(predicted_value == \"1.0\"):\n", + " predicted_value = 'TOXIC'\n", + " \n", + " predicted_values.append(predicted_value)\n", + " pbar.update(n=1)\n", + " MODELS[name]['accuracy'] = getAccuracy(predicted_values)\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will verify if our model is effective with our validation data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ELMO - WOLI - Bert - 300/300 FFNN using BERT vectors no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/30d948b8-9d4c-11eb-aa0f-521c6757c414',\n", + " 'accuracy': 73.0,\n", + " 'predicted_values': []},\n", + " 'ELMO - WOLI-240K-T0.3-NO_VAL - Logistic Regression using TF-IDF vectors V2 no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/6f575112-9ca6-11eb-aefd-de79e166b688',\n", + " 'accuracy': 73.0,\n", + " 'predicted_values': []},\n", + " 'ELMO - WOLI - Bert - 30k/30k FFNN using DistilBERT vectors no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/03279852-9d6a-11eb-a90b-8697a6fa86bd',\n", + " 'accuracy': 85.0,\n", + " 'predicted_values': []},\n", + " 'ELMO - WOLI - Bert - 30k/30k FFNN using BERT vectors no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/49fb832a-9dfa-11eb-b2b9-8697a6fa86bd',\n", + " 'accuracy': 88.0,\n", + " 'predicted_values': []},\n", + 
" 'ELMO - WOLI Distilbert T3,0 100k/100k no stop words': {'url': 'https://predict-ailab.uruit.com/text/classification/predict/721db1da-9eee-11eb-8df4-8697a6fa86bd',\n", + " 'accuracy': 84.0,\n", + " 'predicted_values': []}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "MODELS" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}