From 97aad0915e0b158f5f0389e84ff861c266249961 Mon Sep 17 00:00:00 2001 From: jcollopy-tulane Date: Wed, 1 May 2024 15:02:34 -0500 Subject: [PATCH] Figures --- notebooks/Experiment-Error_Analysis.ipynb | 910 ++++++++++------------ notebooks/Experiments-BERT.ipynb | 244 ------ notebooks/functions/__init__.py | 0 notebooks/functions/functions_utils.py | 79 -- 4 files changed, 425 insertions(+), 808 deletions(-) delete mode 100644 notebooks/Experiments-BERT.ipynb delete mode 100644 notebooks/functions/__init__.py delete mode 100644 notebooks/functions/functions_utils.py diff --git a/notebooks/Experiment-Error_Analysis.ipynb b/notebooks/Experiment-Error_Analysis.ipynb index 5d5db26..e24ae5e 100644 --- a/notebooks/Experiment-Error_Analysis.ipynb +++ b/notebooks/Experiment-Error_Analysis.ipynb @@ -2,31 +2,202 @@ "cells": [ { "cell_type": "code", - "execution_count": 43, + "execution_count": 150, "id": "723a4fc3-1fe3-4e6c-a7a7-b48cf9aadba7", "metadata": {}, "outputs": [], "source": [ - "import pandas as pd" + "import pandas as pd\n", + "import numpy as np\n", + "import re\n", + "import pickle\n", + "from sklearn.naive_bayes import BernoulliNB\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.linear_model import LogisticRegression\n", + "from nltk.corpus import stopwords\n", + "from nltk.stem import PorterStemmer" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 151, "id": "7a164824-3b43-459e-bdc6-e6ff06e75265", "metadata": {}, "outputs": [], "source": [ + "train_df = pd.read_csv(\"../data/train.csv\")\n", + "val_df = pd.read_csv(\"../data/validation.csv\")\n", + "test_df = pd.read_csv(\"../data/test.csv\")\n", + "\n", "bnb = pd.read_csv(\"bnb_results.csv\")\n", "lr = pd.read_csv(\"lr_results.csv\")\n", "cnn = pd.read_csv(\"cnn_results.csv\")\n", "bert = pd.read_csv(\"bert_results.csv\")" ] }, + { + "cell_type": "markdown", + "id": "12b98887-31f1-44b5-b08d-842924010bd6", + "metadata": {}, + "source": [ + "### Get Models" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "id": "06a86d5c-a86d-43af-bd0c-21da92400556", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
BernoulliNB()
" + ], + "text/plain": [ + "BernoulliNB()" + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), binary=True)\n", + "basic_vectorizer = CountVectorizer()\n", + "train_x = nb_vectorizer.fit_transform(train_df[\"Stemmed\"])\n", + "train_y = train_df[\"Result_Bin\"]\n", + "\n", + "nb_model = BernoulliNB()\n", + "nb_model.fit(train_x, train_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "0dfaabbb-1a1e-417b-ba78-a088c2b4bfb8", + "metadata": {}, + "outputs": [], + "source": [ + "# Find Post. Dist.\n", + "def find_prob_nb(text, label):\n", + " words = process_text(text).split()\n", + " print(words)\n", + " evidence = dict()\n", + " df = train_df[train_df[\"Result\"] == label]\n", + " for word in words:\n", + " word_count = train_df['Stemmed'].str.contains(word, case=False).sum()\n", + " evidence[word] = word_count/len(train_df)\n", + "\n", + " likelihood = dict()\n", + " for word in words:\n", + " word_count = df['Stemmed'].str.contains(word, case=False).sum()\n", + " likelihood[word] = word_count/len(df)\n", + " prior = len(df)/len(train_df)\n", + " \n", + " ratio = dict()\n", + "\n", + " for key, value in likelihood.items():\n", + " if key in evidence:\n", + " ratio[key] = value / evidence[key]\n", + " post = dict()\n", + " post = {key: value * prior for key, value in ratio.items()}\n", + "\n", + " print(post)\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "id": "9b7427af-3d09-4d88-ad2d-af7c25dcaad0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LogisticRegression()
" + ], + "text/plain": [ + "LogisticRegression()" + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lr_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2))\n", + "train_x = lr_vectorizer.fit_transform(train_df[\"Stemmed\"])\n", + "train_y = train_df[\"Result_Bin\"]\n", + "\n", + "lr_model = LogisticRegression()\n", + "lr_model.fit(train_x, train_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "bc8284f4-a2c2-4386-a6f8-2355c42e2def", + "metadata": {}, + "outputs": [], + "source": [ + "# Gets Coefficients for LR\n", + "def return_coef_lr(input):\n", + "\n", + " model = lr_model\n", + " text = process_text(input)\n", + " text_vec = lr_vectorizer.transform([text])\n", + "\n", + " vocabulary = lr_vectorizer.get_feature_names_out()\n", + " coefficients = lr_model.coef_[0]\n", + "\n", + " \n", + " word_coefficient_map = {word: coef for word, coef in zip(vocabulary, coefficients)}\n", + "\n", + " \n", + " for word in text.split():\n", + " if word in word_coefficient_map:\n", + " print(f\"Word: {word}, Coefficient: {word_coefficient_map[word]}\")\n", + " else:\n", + " print(f\"Word: {word}, Coefficient: 0\") " + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "id": "c628ef6b-588e-451a-889a-75f030550ee5", + "metadata": {}, + "outputs": [], + "source": [ + "# Processing Text\n", + "\n", + "def process_text(document):\n", + " # Tokenize the document\n", + " tokens = document.split()\n", + " tokens = [re.sub(r'^\\W+|\\W+$', '', token) for token in tokens]\n", + " tokens = [token.lower() for token in tokens]\n", + "\n", + " # Remove stopwords\n", + " stop_words = set(stopwords.words('english'))\n", + " tokens = [token for token in tokens if token not in stop_words]\n", + "\n", + " # Stem the tokens\n", + " stemmer = PorterStemmer()\n", + " stemmed_tokens = [stemmer.stem(token) for token in tokens]\n", + "\n", + " # Return the processed text\n", + " return ' '.join(stemmed_tokens)" + ] + }, { "cell_type": "code", - "execution_count": 45, - "id": "fe41ab19-b216-4c5c-b344-96c87b25ffd1", + "execution_count": 157, + "id": "510953a7-1313-4499-b35d-2c16ba936469", "metadata": {}, "outputs": [ { @@ -163,7 +334,7 @@ "4 1 0.999648 " ] }, - "execution_count": 45, + "execution_count": 157, "metadata": {}, "output_type": "execute_result" } @@ -174,8 +345,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "id": "07b5bb6b-d6e6-454e-852a-cc53b9c28571", + "execution_count": 158, + "id": "091b9b0b-fc61-405f-8a03-342f810b1bd2", "metadata": {}, "outputs": [ { @@ -312,7 +483,7 @@ "4 1 0.958090 " ] }, - "execution_count": 46, + "execution_count": 158, "metadata": {}, "output_type": "execute_result" } @@ -323,8 +494,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "id": "3503eb65-a580-4284-b73a-8e8b72473589", + "execution_count": 159, + "id": "a0e64e88-bf4f-4965-82d7-fe1f08b10ee2", "metadata": {}, "outputs": [ { @@ -461,7 +632,7 @@ "4 1 1 " ] }, - "execution_count": 47, + "execution_count": 159, "metadata": {}, "output_type": "execute_result" } @@ -472,191 +643,8 @@ }, { "cell_type": "code", - "execution_count": 67, - "id": "37790c3f-056a-4426-bdb8-e622b1adbc6a", - "metadata": {}, - "outputs": [], - "source": [ - "bnb_tp = bnb[(bnb[\"Result_Bin\"] == 1) & (bnb[\"Predicted_Result\"] == 1)]\n", - "lr_fn = lr[(lr[\"Result_Bin\"] == 1) & (lr[\"Predicted_Result\"] == 0)]\n", - "cnn_fn = cnn[(cnn[\"Result_Bin\"] == 1) & (cnn[\"Predicted_Label\"] == 0)]\n", - "bert_fn = bert[(bert[\"Result_Bin\"] == 1) & 
(bert[\"Predicted\"] == 0)]" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "62175a58-e883-445e-b282-2a05505f07ae", - "metadata": {}, - "outputs": [], - "source": [ - "first_200_df = bnb_tp.sort_values(\"Predicted_Probability\", ascending = True)\n", - "first_200 = first_200_df['Comment'].head(200)" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "4424f031-2a2c-4c25-a7e7-81b983f1da89", - "metadata": {}, - "outputs": [], - "source": [ - "comments_lr = first_200.isin(lr_fn['Comment'])\n", - "comments_cnn = first_200.isin(cnn_fn['Comment'])\n", - "\n", - "comments_both = first_200[comments_lr & comments_cnn]" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "b9747982-b602-4034-9fbf-a3408da57c2c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "19 Big response against one of the hottest teams ...\n", - "1132 This is my last remaining team-specific subred...\n", - "2114 I point out when he plays well, and I’m gonna ...\n", - "2180 I think what's happening is coaching is trying...\n", - "1489 Problem with being the best is every single te...\n", - "962 Dame’s game is really unbelievable… He’s the b...\n", - "1655 He was questionable with non-covid illness. I ...\n", - "2068 I’m going to keep saying this, Doc is just Coa...\n", - "567 B2B and all those excuses aside, this goes to ...\n", - "647 So we're going to beat the good teams & lose t...\n", - "1629 Fr his playmaking has been nothing short of am...\n", - "2234 Yeah, we were 19-6 at this point last season a...\n", - "446 If dame played like this in Portland, we would...\n", - "2016 I will take the L, but you still think this te...\n", - "2236 Correction: Giannis would’ve bullied AG into g...\n", - "1044 It's because of the strength of schedule i gue...\n", - "544 Teams already sag off him. They celebrate if h...\n", - "1243 Both giannis and dame turned up when it matter...\n", - "1789 Yep people still calling for trades and firing\n", - "2057 Without Giannis or Khris, we beat a Kawhi/PG/H...\n", - "26 But it’s never an excuse when teams have to go...\n", - "1068 some people here complaining we act like the B...\n", - "1430 Doc Rivers > a tree stump > Adrian Griffin.\n", - "342 We shot great from 3, but we shot like ass fro...\n", - "2229 whenever a team beats miami it feels like Chri...\n", - "101 Yeah will be fun seeing him play more. I know ...\n", - "28 He commented on someones ig midway through the...\n", - "33 We've been trying to replace PJ Tucker since h...\n", - "14 End of the bench playing basketball happened lol\n", - "1609 Yep. And if he gets that first one he gets ano...\n", - "133 Super happy with the result. Really good team ...\n", - "986 Playing 3rd game in 4 nights and the first two...\n", - "1693 I'm pretty use to it by now but this game was ...\n", - "1196 Still looking for that sweet spot but I’m fine...\n", - "1962 You are right if we are aiming for winning in ...\n", - "Name: Comment, dtype: object" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "comments_both" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "3c85b034-378e-4c46-8754-1f18cffa7f18", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "19 Big response against one of the hottest teams in the Association right now. Cavs will cause a lot of problems especially if fully healthy. 
We’ve got a lot of uncertainties with how we will organize as a team but we have the answer to the biggest question: do we have the talent to hang with any team? Good W!\n", - "1132 This is my last remaining team-specific subreddit, the others became too toxic so I unsubbed. This one is heading there too, sadly. I think the Dame trade brought in a bunch of sad Packers fans (there are many) and spoiled what used to be my favorite sports subreddit. I'm still hopeful it'll bounce back ...\n", - "2114 I point out when he plays well, and I’m gonna do the same when he plays like shit. 4/18 isn’t good enough he should know when his shot is off and stop chucking\n", - "2180 I think what's happening is coaching is trying to use a scheme to cater to the defenders. Dame, Payne and Beasley aren't one on one stoppers. So using a switching zone seems effective and Brook can still troll the paint.\n", - "1489 Problem with being the best is every single team brings their best against you. Think we're gonna have our mettle tested very often this year.\n", - "962 Dame’s game is really unbelievable… He’s the best logo shooter of all time statistically. If you come out too far, he’ll explode right past you and shoot or get a dunk/layup/foul. If you grab at him, he’s the best freethrow shooter in the league this season (and headed to 4th career all-time). If your teammate comes over to help, he is an elite playmaker and will make the right pass more often than not. He’s like prime Harden but much smaller, better at shooting, and way more clutch.\n", - "1655 He was questionable with non-covid illness. I thought he looked off but knowing he was sick made me give him a pass, which I doubt many on this sub will do. Wouldn't surprise me if he was under the weather versus Boston too.\n", - "2068 I’m going to keep saying this, Doc is just Coach Bud and that’s fine. Not ideal but it’s fine.\n", - "567 B2B and all those excuses aside, this goes to show that the Bucks are still a threat. Yeah, there's some problems but they still have the second best record in the east and this game shows why.\n", - "647 So we're going to beat the good teams & lose to the bad teams? The opposite of last season. I'll take it.\n", - "1629 Fr his playmaking has been nothing short of amazing. Last game there was a sequence where 2-3 times in a row dame got the ball, drove and made the most perfect right play pass but all the players he passed it to missed. And i was just thinking i really hope this doesnt discourage him from keeping to make the right pass\n", - "2234 Yeah, we were 19-6 at this point last season and that was after the torrid 9-0 start. We are winning a lot more consistently this season and it looks like we’re starting to exit the clutch time purgatory we’ve been stuck in to start the season finally (knock on wood lol)\n", - "446 If dame played like this in Portland, we would have been pissed to trade jrue. Stop moving the goalposts, we expected a superstar and this ain't it\n", - "2016 I will take the L, but you still think this team has nothing to concern with and don’t need trade? Don’t act like Blazers is a good team and this is a convincing win\n", - "2236 Correction: Giannis would’ve bullied AG into getting him back in the game. He knows Doc won’t go for that shit lol. This is why we needed a veteran coach\n", - "1044 It's because of the strength of schedule i guess, but that's something out of the players' control anyways\n", - "544 Teams already sag off him. They celebrate if he takes a 3. 
I don't disagree with Giannis taking one or two 3's a game, but it just can't be more than that.\n", - "1243 Both giannis and dame turned up when it mattered but I can't trust this defence to save a single basket anymore. Struggling against these lot tells us more about us tbh\n", - "1789 Yep people still calling for trades and firing\n", - "2057 Without Giannis or Khris, we beat a Kawhi/PG/Harden Clippers team and held them to 106 points. Imagine saying that two months ago.\n", - "26 But it’s never an excuse when teams have to go to Denver and play in that altitude with only a day to adjust to it. Media is trash. Anything to protect their darlings.\n", - "1068 some people here complaining we act like the Bucks has lost (they did lose b2b to the Pacers) while they just act like the Spurs is a top 3 team so we barely beating them should be viewed as an accomplishment\n", - "1430 Doc Rivers > a tree stump > Adrian Griffin.\n", - "342 We shot great from 3, but we shot like ass from 2, which is also out of the norm. Most of those looks from 3 were wide open too\n", - "2229 whenever a team beats miami it feels like Christmas morning\n", - "101 Yeah will be fun seeing him play more. I know he’s good but didn’t watch many Blazer games since the Bucks hardly ever play them. Mostly pay attention to the Eastern conference most the year.\n", - "28 He commented on someones ig midway through the game saying he was alright haha i think hell be back pretty soon\n", - "33 We've been trying to replace PJ Tucker since he left, we might have to finally done it.\n", - "14 End of the bench playing basketball happened lol\n", - "1609 Yep. And if he gets that first one he gets another one. Basically he stops shooting those whenever he misses one.\n", - "133 Super happy with the result. Really good team win. Really sad about the lack of MarJon minutes. I have a feeling he's going to be shipped out before the deadline.\n", - "986 Playing 3rd game in 4 nights and the first two against the team with the 2nd fastest pace and the 3rd game against a team with the 3rd fastest pace was obviously an issue for our old asses.\n", - "1693 I'm pretty use to it by now but this game was another one of those that my mind can't wrap around the fact that we have Damian Lillard\n", - "1196 Still looking for that sweet spot but I’m fine with all the experimenting with lineups and schemes early on. 
Reserving serious judgement for 2024.\n", - "1962 You are right if we are aiming for winning in the regular season and getting into the playoffs only Insert \"This is fine\" meme\n", - "Name: Comment, dtype: object\n" - ] - } - ], - "source": [ - "with pd.option_context('display.max_colwidth', None):\n", - " print(comments_both)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cf46d1c-4eb8-44ab-8e40-a81094e6188e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6479afe-f5f2-4fd5-b87a-4a878475b085", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "711e3fb8-bb68-4ce4-ae0e-c958b2cc5b32", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "846f9cbd-b0ed-48b0-b772-c8cd238334f5", - "metadata": {}, - "outputs": [], - "source": [ - "bnb_fp = bnb[(bnb[\"Result_Bin\"] == 0) & (bnb[\"Predicted_Result\"] == 1)]\n", - "lr_tn = lr[(lr[\"Result_Bin\"] == 0) & (lr[\"Predicted_Result\"] == 0)]\n", - "cnn_tn = cnn[(cnn[\"Result_Bin\"] == 0) & (cnn[\"Predicted_Label\"] == 0)]" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "179a9408-e2df-475a-9e42-db6041d14608", + "execution_count": 160, + "id": "175ff3dc-900c-42c4-bf7e-a948ddcb0b96", "metadata": {}, "outputs": [ { @@ -687,8 +675,7 @@ " No_Stop\n", " Stemmed\n", " Result_Bin\n", - " Predicted_Result\n", - " Predicted_Probability\n", + " Predicted\n", " \n", " \n", " \n", @@ -701,8 +688,18 @@ " feel like became dependent recent late-game he...\n", " feel like becam depend recent late-gam heroic ...\n", " 0\n", - " 0\n", - " 0.225801\n", + " 1\n", + " \n", + " \n", + " 1\n", + " 1\n", + " I like it even more when I don't think we're g...\n", + " Win\n", + " i like it even more when i don't think we're g...\n", + " like even n't think 're gon na win win anyways\n", + " like even n't think 're gon na win win anyway\n", + " 1\n", + " 1\n", " \n", " \n", " 2\n", @@ -714,43 +711,28 @@ " game confirm everyth alreadi knew bobbi ’ play...\n", " 0\n", " 0\n", - " 0.305607\n", " \n", " \n", - " 5\n", - " 5\n", - " Good fight on the road against the defending c...\n", - " Loss\n", - " good fight on the road against the defending c...\n", - " good fight road defending champs bad first gam...\n", - " good fight road defend champ bad first game doc\n", - " 0\n", - " 0\n", - " 0.061192\n", - " \n", - " \n", - " 6\n", - " 6\n", - " Doc is overrated. Major L recruiting him\n", + " 3\n", + " 3\n", + " I’m in shock as a Blazers fan. I know for a fa...\n", " Loss\n", - " doc is overrated major l recruiting him\n", - " doc overrated major l recruiting\n", - " doc overr major l recruit\n", + " i’m in shock as a blazers fan i know for a fac...\n", + " ’ shock blazers fan know fact lillard play way...\n", + " ’ shock blazer fan know fact lillard play way ...\n", " 0\n", " 0\n", - " 0.343441\n", " \n", " \n", - " 8\n", - " 8\n", - " Yep! 
We got a lot of bucks basketball left woo...\n", - " Loss\n", - " yep we got a lot of bucks basketball left woot...\n", - " yep got lot bucks basketball left woot hope do...\n", - " yep got lot buck basketbal left woot hope dont...\n", - " 0\n", + " 4\n", + " 4\n", + " Can we please change the banner to our current...\n", + " Win\n", + " can we please change the banner to our current...\n", + " please change banner current bucks roster ’ wa...\n", + " pleas chang banner current buck roster ’ want ...\n", + " 1\n", " 0\n", - " 0.436663\n", " \n", " \n", "\n", @@ -759,274 +741,200 @@ "text/plain": [ " Unnamed: 0 Comment Result \\\n", "0 0 I feel like we became too dependent on our rec... Loss \n", + "1 1 I like it even more when I don't think we're g... Win \n", "2 2 This game confirmed everything I already knew ... Loss \n", - "5 5 Good fight on the road against the defending c... Loss \n", - "6 6 Doc is overrated. Major L recruiting him Loss \n", - "8 8 Yep! We got a lot of bucks basketball left woo... Loss \n", + "3 3 I’m in shock as a Blazers fan. I know for a fa... Loss \n", + "4 4 Can we please change the banner to our current... Win \n", "\n", " Comment_Adj \\\n", "0 i feel like we became too dependent on our rec... \n", + "1 i like it even more when i don't think we're g... \n", "2 this game confirmed everything i already knew ... \n", - "5 good fight on the road against the defending c... \n", - "6 doc is overrated major l recruiting him \n", - "8 yep we got a lot of bucks basketball left woot... \n", + "3 i’m in shock as a blazers fan i know for a fac... \n", + "4 can we please change the banner to our current... \n", "\n", " No_Stop \\\n", "0 feel like became dependent recent late-game he... \n", + "1 like even n't think 're gon na win win anyways \n", "2 game confirmed everything already knew bobby ’... \n", - "5 good fight road defending champs bad first gam... \n", - "6 doc overrated major l recruiting \n", - "8 yep got lot bucks basketball left woot hope do... \n", - "\n", - " Stemmed Result_Bin \\\n", - "0 feel like becam depend recent late-gam heroic ... 0 \n", - "2 game confirm everyth alreadi knew bobbi ’ play... 0 \n", - "5 good fight road defend champ bad first game doc 0 \n", - "6 doc overr major l recruit 0 \n", - "8 yep got lot buck basketbal left woot hope dont... 0 \n", + "3 ’ shock blazers fan know fact lillard play way... \n", + "4 please change banner current bucks roster ’ wa... \n", "\n", - " Predicted_Result Predicted_Probability \n", - "0 0 0.225801 \n", - "2 0 0.305607 \n", - "5 0 0.061192 \n", - "6 0 0.343441 \n", - "8 0 0.436663 " + " Stemmed Result_Bin Predicted \n", + "0 feel like becam depend recent late-gam heroic ... 0 1 \n", + "1 like even n't think 're gon na win win anyway 1 1 \n", + "2 game confirm everyth alreadi knew bobbi ’ play... 0 0 \n", + "3 ’ shock blazer fan know fact lillard play way ... 0 0 \n", + "4 pleas chang banner current buck roster ’ want ... 
1 0 " ] }, - "execution_count": 49, + "execution_count": 160, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "lr_tn.head()" + "bert.head()" ] }, { - "cell_type": "code", - "execution_count": 50, - "id": "00b14cb7-40b5-49b3-949c-6bc69c61c3a8", + "cell_type": "markdown", + "id": "f2e8667b-9b6d-4b12-9b00-d0c6ab2ea194", "metadata": {}, - "outputs": [], "source": [ - "first_200_df = bnb_fp.sort_values(\"Predicted_Probability\", ascending = False)\n", - "first_200 = first_200_df['Comment'].head(200)" + "## Evaluating False Negatives for LR, BERT, and CNN" ] }, { "cell_type": "code", - "execution_count": 51, - "id": "03ad08ff-5653-4404-a3b4-b5ff1e473030", + "execution_count": 161, + "id": "ee1e09c8-4a76-4e4d-bf52-5ef417d41680", "metadata": {}, "outputs": [], "source": [ - "comments_lr = first_200.isin(lr_tn['Comment'])\n", - "comments_cnn = first_200.isin(cnn_tn['Comment'])\n", - "\n", - "comments_both = first_200[comments_lr & comments_cnn]" + "bnb_tp = bnb[(bnb[\"Result\"] == \"Win\") & (bnb[\"Predicted_Result\"] == 1)]\n", + "lr_fn = lr[(lr[\"Result\"] == \"Win\") & (lr[\"Predicted_Result\"] == 0)]\n", + "cnn_fn = cnn[(cnn[\"Result\"] == \"Win\") & (cnn[\"Predicted_Label\"] == 0)]\n", + "bert_fn = bert[(bert[\"Result\"] == \"Win\") & (bert[\"Predicted\"] == 0)]" ] }, { "cell_type": "code", - "execution_count": 52, - "id": "7993b8fe-2a25-4563-a001-725984d45602", + "execution_count": 162, + "id": "7d5a379e-df32-461d-8783-7e911d437a84", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2223 Livingston had some decent moments on both ends tonight. Hopefully can see some improvement with more minutes\n", - "1373 They used to be bad… really bad…\n", - "1876 Rather him than 3-13 dame tbh\n", - "1213 Having watched this game: Yes. The Bucks are really fucking stupid.\n", - "580 If you can't suggest players in the proposed trade, then this is useless.\n", - "Name: Comment, dtype: object\n" + "Yep people still calling for trades and firing\n", + "Dame’s game is really unbelievable… He’s the best logo shooter of all time statistically. If you come out too far, he’ll explode right past you and shoot or get a dunk/layup/foul. If you grab at him, he’s the best freethrow shooter in the league this season (and headed to 4th career all-time). If your teammate comes over to help, he is an elite playmaker and will make the right pass more often than not. He’s like prime Harden but much smaller, better at shooting, and way more clutch.\n", + "on ESPN they said could be the next game or the game after next\n", + "Malik is playing well, sending him to the bench now will throw him off\n" ] } ], "source": [ - "with pd.option_context('display.max_colwidth', None):\n", - " print(comments_both)" + "bnb_comments = set(bnb_tp[\"Comment\"])\n", + "lr_comments = set(lr_fn[\"Comment\"])\n", + "cnn_comments = set(cnn_fn[\"Comment\"])\n", + "bert_comments = set(bert_fn[\"Comment\"])\n", + "\n", + "common_comments = lr_comments.intersection(cnn_comments, bert_comments, bnb_comments)\n", + "\n", + "for i, comment in enumerate(common_comments):\n", + " if i < 4:\n", + " print(comment)\n", + " else:\n", + " break" ] }, { "cell_type": "code", - "execution_count": 54, - "id": "f3d5f504-c9cb-44ea-9343-ede7c9462493", + "execution_count": 163, + "id": "6540bfc4-669c-470d-9847-1d4a94003940", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Unnamed: 0 Comment Result \\\n", - "892 892 If the scheme that you designed is so bad that... 
Loss \n", - "921 921 I can't much blame Livingston for his offense ... Loss \n", - "929 929 I think there's probs some hesitation to shoot... Loss \n", - "1809 1809 MarJon was already out of the rotation - and I... Loss \n", - "2223 2223 Livingston had some decent moments on both end... Loss \n", - "\n", - " Comment_Adj \\\n", - "892 if the scheme that you designed is so bad that... \n", - "921 i can't much blame livingston for his offense ... \n", - "929 i think there's probs some hesitation to shoot... \n", - "1809 marjon was already out of the rotation and i ... \n", - "2223 livingston had some decent moments on both end... \n", - "\n", - " No_Stop \\\n", - "892 scheme designed bad chris livingston best play... \n", - "921 ca n't much blame livingston offense game cons... \n", - "929 think 's probs hesitation shoot specifically a... \n", - "1809 marjon already rotation n't know 've noticed '... \n", - "2223 livingston decent moments ends tonight hopeful... \n", - "\n", - " Stemmed Result_Bin \\\n", - "892 scheme design bad chri livingston best player ... 0 \n", - "921 ca n't much blame livingston offens game consi... 0 \n", - "929 think 's prob hesit shoot specif ajj livingsto... 0 \n", - "1809 marjon alreadi rotat n't know 've notic 's lin... 0 \n", - "2223 livingston decent moment end tonight hope see ... 0 \n", - "\n", - " Predicted_Result Predicted_Probability \n", - "892 0 0.486864 \n", - "921 0 0.263094 \n", - "929 0 0.295788 \n", - "1809 0 0.158327 \n", - "2223 0 0.382528 \n" + "Word: malik, Coefficient: -0.5406748829539177\n", + "Word: play, Coefficient: -0.056864372549471444\n", + "Word: well, Coefficient: -0.24822874624598537\n", + "Word: send, Coefficient: 0.4619140404395336\n", + "Word: bench, Coefficient: -0.3270444798255762\n", + "Word: throw, Coefficient: -0.0845347188654442\n" ] } ], "source": [ - "target_comment = \"Livingston\"\n", - "\n", - "# Find the comment in the dataframe\n", - "found_comment = lr_tn[lr_tn['Comment'].str.contains(target_comment)]\n", - "\n", - "# Print the found comment\n", - "print(found_comment)" + "target_comment = \"Malik is playing well, sending him to the bench now will throw him off\"\n", + "return_coef_lr(target_comment)" ] }, { "cell_type": "code", - "execution_count": 55, - "id": "05162f82-3e62-4153-9480-5642d5cb1218", + "execution_count": 164, + "id": "39955afd-8b36-43f6-b434-8127598342ba", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Unnamed: 0 Comment Result \\\n", - "892 892 If the scheme that you designed is so bad that... Loss \n", - "921 921 I can't much blame Livingston for his offense ... Loss \n", - "929 929 I think there's probs some hesitation to shoot... Loss \n", - "1075 1075 Wiggins doesn’t think he’s a superstar and is ... Win \n", - "1209 1209 My dream of Chris Livingston Jr becoming a rot... Win \n", - "1235 1235 The Blazers are good at their developing their... Loss \n", - "1809 1809 MarJon was already out of the rotation - and I... Loss \n", - "2223 2223 Livingston had some decent moments on both end... Loss \n", - "\n", - " Comment_Adj \\\n", - "892 if the scheme that you designed is so bad that... \n", - "921 i can't much blame livingston for his offense ... \n", - "929 i think there's probs some hesitation to shoot... \n", - "1075 wiggins doesn’t think he’s a superstar and is ... \n", - "1209 my dream of chris livingston jr becoming a rot... \n", - "1235 the blazers are good at their developing their... \n", - "1809 marjon was already out of the rotation and i ... 
\n", - "2223 livingston had some decent moments on both end... \n", - "\n", - " No_Stop \\\n", - "892 scheme designed bad chris livingston best play... \n", - "921 ca n't much blame livingston offense game cons... \n", - "929 think 's probs hesitation shoot specifically a... \n", - "1075 wiggins ’ think ’ superstar smart enough make ... \n", - "1209 dream chris livingston jr becoming rotation pl... \n", - "1235 blazers good developing players simply underst... \n", - "1809 marjon already rotation n't know 've noticed '... \n", - "2223 livingston decent moments ends tonight hopeful... \n", - "\n", - " Stemmed Result_Bin \\\n", - "892 scheme design bad chri livingston best player ... 0 \n", - "921 ca n't much blame livingston offens game consi... 0 \n", - "929 think 's prob hesit shoot specif ajj livingsto... 0 \n", - "1075 wiggin ’ think ’ superstar smart enough make e... 1 \n", - "1209 dream chri livingston jr becom rotat player al... 1 \n", - "1235 blazer good develop player simpli understand y... 0 \n", - "1809 marjon alreadi rotat n't know 've notic 's lin... 0 \n", - "2223 livingston decent moment end tonight hope see ... 0 \n", - "\n", - " Predicted_Result Predicted_Probability \n", - "892 0 0.486864 \n", - "921 0 0.263094 \n", - "929 0 0.295788 \n", - "1075 0 0.022897 \n", - "1209 1 0.827561 \n", - "1235 1 0.558184 \n", - "1809 0 0.158327 \n", - "2223 0 0.382528 \n" + "['malik', 'play', 'well', 'send', 'bench', 'throw']\n", + "{'malik': 0.43478260869565216, 'play': 0.4826001313197636, 'well': 0.4766666666666667, 'send': 0.6428571428571427, 'bench': 0.5251396648044693, 'throw': 0.5132743362831859}\n" ] } ], "source": [ - "target_comment = \"Livingston\"\n", - "\n", - "# Find the comment in the dataframe\n", - "found_comment = lr[lr['Comment'].str.contains(target_comment)]\n", - "\n", - "# Print the found comment\n", - "print(found_comment)" + "find_prob_nb(target_comment, \"Win\")" + ] + }, + { + "cell_type": "markdown", + "id": "845efb3e-655c-43c6-950d-0a19f01eae3b", + "metadata": {}, + "source": [ + "## Evaluating False Positives" ] }, { "cell_type": "code", - "execution_count": 41, - "id": "283aeffc-9189-486c-b3e3-7d007750ff64", + "execution_count": 165, + "id": "846f9cbd-b0ed-48b0-b772-c8cd238334f5", + "metadata": {}, + "outputs": [], + "source": [ + "bnb_fp = bnb[(bnb[\"Result\"] == \"Loss\") & (bnb[\"Predicted_Result\"] == 1)].sort_values(\"Predicted_Probability\", ascending = False)\n", + "lr_tn = lr[(lr[\"Result\"] == \"Loss\") & (lr[\"Predicted_Result\"] == 0)]\n", + "cnn_tn = cnn[(cnn[\"Result\"] == \"Loss\") & (cnn[\"Predicted_Label\"] == 0)]\n", + "bert_tn = bert[(bert[\"Result\"] == \"Loss\") & (bert[\"Predicted\"] == 0)]" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "id": "597cb4c9-131d-465f-a457-bdc1da3c25d4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " Unnamed: 0 Comment Result \\\n", - "2223 2223 Livingston had some decent moments on both end... Loss \n", - "\n", - " Comment_Adj \\\n", - "2223 livingston had some decent moments on both end... \n", - "\n", - " No_Stop \\\n", - "2223 livingston decent moments ends tonight hopeful... \n", - "\n", - " Stemmed Result_Bin \\\n", - "2223 livingston decent moment end tonight hope see ... 
0 \n", - "\n", - " Predicted_Result Predicted_Probability \n", - "2223 1 0.999957 \n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/hs/br_4rpdj68nc3sfdpgv0xgn80000gn/T/ipykernel_24579/3657591789.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", - " found_comment = bnb_fp[bnb['Comment'].str.contains(target_comment)]\n" + "None of us know what was at the root of the tension between these guys. They might not even understand it themselves.\n", + "Dame is checked out and is perpetually lazy, apathetic, and downright stupid on the court. It’s miserable to watch. I miss Jrue, and Khris is better than Dame right now.\n", + "Don’t let a regular season game in November fool you if both teams matched up in the playoffs the Bucks beat this team in 5\n", + "2 FOR 19 FROM 3 for Lopez/Dame/Khris…unacceptable\n", + "I’ll do u one better. The guy hasn’t made it out of the 2nd round since the big three celtics.\n", + "The Bucks have played the easiest schedule in the NBA so far by a pretty decent margin too.\n", + "Doc has to.be trolling us right? His post game press conference he talked about getting Pat more minutes so he's ready and confident down the stretch.\n", + "Both Pacers and Heat have the shooters to have decent odds of a hot streak from 3 taking some games they shouldnt.\n", + "We won’t see the second round lmao\n", + "If you blame the bench, you don't know basketball. The real problem is the starting 5. They don't click together.\n" ] } ], "source": [ - "target_comment = \"Livingston had some decent moments on both ends tonight. Hopefully can see some improvement with more minutes\"\n", + "bnb_comments = set(bnb_fp[\"Comment\"])\n", + "lr_comments = set(lr_tn[\"Comment\"])\n", + "cnn_comments = set(cnn_tn[\"Comment\"])\n", + "bert_comments = set(bert_tn[\"Comment\"])\n", "\n", - "# Find the comment in the dataframe\n", - "found_comment = bnb_fp[bnb['Comment'].str.contains(target_comment)]\n", + "common_comments = lr_comments.intersection(cnn_comments, bert_comments, bnb_comments)\n", "\n", - "# Print the found comment\n", - "print(found_comment)" + "for i, comment in enumerate(common_comments):\n", + " if i < 10:\n", + " print(comment)\n", + " else:\n", + " break" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 171, "id": "322fdf5b-4f85-4f4a-84c0-45f0a6f14f1a", "metadata": {}, "outputs": [ @@ -1034,105 +942,137 @@ "name": "stdout", "output_type": "stream", "text": [ - " Unnamed: 0 Comment Result \\\n", - "2223 2223 Livingston had some decent moments on both end... Loss \n", - "\n", - " Comment_Adj \\\n", - "2223 livingston had some decent moments on both end... \n", - "\n", - " No_Stop \\\n", - "2223 livingston decent moments ends tonight hopeful... \n", - "\n", - " Stemmed Result_Bin \\\n", - "2223 livingston decent moment end tonight hope see ... 
0 \n", - "\n", - " Predicted_Result Predicted_Probability \n", - "2223 0 0.382528 \n" + "Word: dame, Coefficient: -0.1700790422176605\n", + "Word: check, Coefficient: 0.24744862055832584\n", + "Word: perpetu, Coefficient: -0.027956271452183712\n", + "Word: lazi, Coefficient: -0.23844876824699507\n", + "Word: apathet, Coefficient: 0\n", + "Word: downright, Coefficient: 0\n", + "Word: stupid, Coefficient: -0.6631070629775302\n", + "Word: court, Coefficient: 0.146007882187147\n", + "Word: it’, Coefficient: 0\n", + "Word: miser, Coefficient: 0.11285656253966668\n", + "Word: watch, Coefficient: -0.25341505528863745\n", + "Word: miss, Coefficient: -0.4716422624688569\n", + "Word: jrue, Coefficient: -0.33698888558356804\n", + "Word: khri, Coefficient: 0.34432296915886257\n", + "Word: better, Coefficient: 0.15519712399002292\n", + "Word: dame, Coefficient: -0.1700790422176605\n", + "Word: right, Coefficient: -0.2621512216885956\n" ] } ], "source": [ - "target_comment = \"Livingston had some decent moments on both ends tonight. Hopefully can see some improvement with more minutes\"\n", + "target_comment = \"Dame is checked out and is perpetually lazy, apathetic, and downright stupid on the court. It’s miserable to watch. I miss Jrue, and Khris is better than Dame right now\"\n", "\n", - "# Find the comment in the dataframe\n", - "found_comment = lr_tn[lr_tn['Comment'].str.contains(target_comment)]\n", - "\n", - "# Print the found comment\n", - "print(found_comment)" + "return_coef_lr(target_comment)" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "b5d582d2", + "execution_count": 170, + "id": "0f2667ff", "metadata": {}, "outputs": [ { - "ename": "ImportError", - "evalue": "attempted relative import with no known parent package", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpickle\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfunctions\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfunctions_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m process_text, basic_process, cnn_process\n", - "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package" + "name": "stdout", + "output_type": "stream", + "text": [ + "['dame', 'check', 'perpetu', 'lazi', 'apathet', 'downright', 'stupid', 'court', 'it’', 'miser', 'watch', 'miss', 'jrue', 'khri', 'better', 'dame', 'right']\n", + "{'dame': 0.4727272727272728, 'check': 0.31034482758620685, 'perpetu': 0.49999999999999994, 'lazi': 0.49999999999999994, 'apathet': nan, 'downright': nan, 'stupid': 0.6086956521739131, 'court': 0.4726027397260274, 'it’': nan, 'miser': 0.5555555555555556, 'watch': 0.541958041958042, 'miss': 0.5688888888888889, 'jrue': 0.58, 'khri': 0.40449438202247184, 'better': 0.44941176470588234, 'right': 0.5503875968992248}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/hs/br_4rpdj68nc3sfdpgv0xgn80000gn/T/ipykernel_5441/1060855778.py:21: RuntimeWarning: invalid value encountered in double_scalars\n", + " ratio[key] = value / evidence[key]\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0CommentResultComment_AdjNo_StopStemmedResult_BinPredicted_ResultPredicted_Probability
12821282Dame is checked out and is perpetually lazy, a...Lossdame is checked out and is perpetually lazy ap...dame checked perpetually lazy apathetic downri...dame check perpetu lazi apathet downright stup...010.995509
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Comment Result \\\n", + "1282 1282 Dame is checked out and is perpetually lazy, a... Loss \n", + "\n", + " Comment_Adj \\\n", + "1282 dame is checked out and is perpetually lazy ap... \n", + "\n", + " No_Stop \\\n", + "1282 dame checked perpetually lazy apathetic downri... \n", + "\n", + " Stemmed Result_Bin \\\n", + "1282 dame check perpetu lazi apathet downright stup... 0 \n", + "\n", + " Predicted_Result Predicted_Probability \n", + "1282 1 0.995509 " + ] + }, + "execution_count": 170, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "import pickle\n", - "from ...functions.functions_utils import process_text, basic_process, cnn_process" + "find_prob_nb(target_comment, \"Loss\")\n", + "result = bnb[bnb['Comment'].str.contains(target_comment)]\n", + "result" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0423d0d1", - "metadata": {}, - "outputs": [], - "source": [ - "def return_coef_bnb(text):\n", - "\n", - " model = bnb\n", - " text = process_text(input_field)\n", - " text = bnb_vectorizer.transform([text])\n", - "\n", - " vocabulary = vectorizer.get_feature_names_out()\n", - " coefficients = model.coef_[0]\n", - "\n", - " # Map coefficients to words\n", - " word_coefficient_map = {word: coef for word, coef in zip(vocabulary, coefficients)}\n", - "\n", - " # Print coefficients for words in the input text\n", - " for word in input_text.split():\n", - " if word in word_coefficient_map:\n", - " print(f\"Word: {word}, Coefficient: {word_coefficient_map[word]}\")\n", - " else:\n", - " print(f\"Word: {word}, Coefficient: 0\") " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ddb1cd57", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6f2f431", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python (testenv)", "language": "python", - "name": "python3" + "name": "testenv" }, "language_info": { "codemirror_mode": { @@ -1144,7 +1084,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/notebooks/Experiments-BERT.ipynb b/notebooks/Experiments-BERT.ipynb deleted file mode 100644 index e42126c..0000000 --- a/notebooks/Experiments-BERT.ipynb +++ /dev/null @@ -1,244 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "e55024f6-48aa-46ae-ae28-c8e342b41d05", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/testenv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "from transformers import DistilBertTokenizerFast\n", - "from transformers import TFDistilBertForSequenceClassification\n", - "from transformers import set_seed\n", - "import tensorflow as tf\n", - "from tqdm import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "81a58130-e8c9-4e5a-8080-8e0a90e98171", - "metadata": {}, - "outputs": [], - "source": [ - "train_df = pd.read_csv(\"../data/train.csv\")\n", - "val_df = pd.read_csv(\"../data/validation.csv\")\n", - "test_df = pd.read_csv(\"../data/test.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "56356a8e-bc73-432d-902e-70f7c6125df7", - "metadata": {}, - "outputs": [], - "source": [ - "tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "1ba9fa14-53a1-44c4-9d13-1baed5ba3c36", - "metadata": {}, - "outputs": [], - "source": [ - "def tokenize(sentences, max_length=100, padding='max_length'):\n", - " return tokenizer(\n", - " sentences,\n", - " truncation=True,\n", - " padding=padding,\n", - " max_length=max_length,\n", - " return_tensors=\"tf\" \n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "ac239be7-6e4d-4ceb-854d-2031c1c9db7c", - "metadata": {}, - "outputs": [], - "source": [ - "bert_x_train = train_df[\"Comment_Adj\"].tolist()\n", - "bert_y_train = train_df[\"Result_Bin\"].tolist()\n", - "bert_x_val = val_df[\"Comment_Adj\"].tolist()\n", - "bert_y_val = val_df[\"Result_Bin\"].tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "5c468de7-c416-4ec6-8a86-0984f0671c7b", - "metadata": {}, - "outputs": [], - "source": [ - "train_encodings = tokenize(bert_x_train)\n", - "val_encodings = tokenize(bert_x_val)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b4ace8c8-91e6-4a5d-bc5e-2a2dccfebcd8", - "metadata": {}, - "outputs": [], - "source": [ - "train_labels = tf.convert_to_tensor(bert_y_train, dtype=tf.int32)\n", - "val_labels = tf.convert_to_tensor(bert_y_val, dtype=tf.int32)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "783c1b81-4d0b-40bb-9c05-3faa24a9c911", - "metadata": {}, - "outputs": [], - "source": [ - "seed_value = 42\n", - "set_seed(seed_value)\n", - "\n", - "train_dataset = tf.data.Dataset.from_tensor_slices((\n", - " dict(train_encodings), \n", - " train_labels\n", - ")).shuffle(1000).batch(30).prefetch(1)\n", - "\n", - "validation_dataset = tf.data.Dataset.from_tensor_slices((\n", - " dict(val_encodings), \n", - " val_labels\n", - ")).batch(30).prefetch(1)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "d46d7bd2-808e-415b-878b-5e0146cb2705", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']\n", - "- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. 
initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/2\n", - "228/228 [==============================] - 976s 4s/step - loss: 0.6641 - accuracy: 0.5987 - val_loss: 0.6537 - val_accuracy: 0.5980\n", - "Epoch 2/2\n", - "228/228 [==============================] - 1016s 4s/step - loss: 0.5443 - accuracy: 0.7240 - val_loss: 0.6240 - val_accuracy: 0.6652\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "seed_value = 42\n", - "set_seed(seed_value)\n", - "\n", - "model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)\n", - "optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5)\n", - "model.compile(\n", - " optimizer=optimizer,\n", - " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", - " metrics = [\"accuracy\")\n", - "\n", - "model.fit(\n", - " x=train_dataset,\n", - " y=None,\n", - " validation_data=validation_dataset,\n", - " batch_size=30,\n", - " epochs=2\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "b4882ab2-48f9-472b-b5eb-b817f48c7418", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-04-28 13:32:35.788501: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int32 and shape [2276,100]\n", - "\t [[{{node Placeholder/_1}}]]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "76/76 [==============================] - 93s 1s/step - loss: 0.6230 - accuracy: 0.6586\n", - "Loss: 0.622992217540741\n", - "Accuracy: 0.6586115956306458\n" - ] - } - ], - "source": [ - "bert_x_test = test_df[\"Comment_Adj\"].tolist()\n", - "bert_y_test = test_df[\"Result_Bin\"].tolist()\n", - "\n", - "test_encodings = tokenize(bert_x_test)\n", - "\n", - "test_labels = tf.convert_to_tensor(bert_y_test, dtype=tf.int32)\n", - "\n", - "test_dataset = tf.data.Dataset.from_tensor_slices((\n", - " dict(test_encodings), \n", - " test_labels\n", - ")).shuffle(1000).batch(30).prefetch(1)\n", - "\n", - "results = model.evaluate(test_dataset)\n", - "print(\"Loss:\", results[0])\n", - "print(\"Accuracy:\", results[1])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python (testenv)", - "language": "python", - "name": "testenv" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 
"3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/functions/__init__.py b/notebooks/functions/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/notebooks/functions/functions_utils.py b/notebooks/functions/functions_utils.py deleted file mode 100644 index 8971909..0000000 --- a/notebooks/functions/functions_utils.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import re -import pandas as pd -from nltk.corpus import stopwords -from nltk.stem import PorterStemmer -from tensorflow.keras.preprocessing.text import Tokenizer -from tensorflow.keras.preprocessing.sequence import pad_sequences - -def process_text(document): - # Tokenize the document - tokens = document.split() - tokens = [re.sub(r'^\W+|\W+$', '', token) for token in tokens] - tokens = [token.lower() for token in tokens] - - # Remove stopwords - stop_words = set(stopwords.words('english')) - tokens = [token for token in tokens if token not in stop_words] - - # Stem the tokens - stemmer = PorterStemmer() - stemmed_tokens = [stemmer.stem(token) for token in tokens] - - # Return the processed text - return ' '.join(stemmed_tokens) - - -tokenizer = Tokenizer() -train_df = pd.read_csv("/Users/jackiecollopy/Downloads/project-reddit/data/train.csv") -val_df = pd.read_csv("/Users/jackiecollopy/Downloads/project-reddit/data/train.csv") -test_df = pd.read_csv("/Users/jackiecollopy/Downloads/project-reddit/data/train.csv") - -def basic_process(document): - # Tokenize the document - tokens = document.split() - # Remove punctuation at the start and end of each token and convert to lowercase - tokens = [re.sub(r'^\W+|\W+$', '', token).lower() for token in tokens] - # Join processed tokens back into a string - processed_text = ' '.join(tokens) - return processed_text - -def cnn_process(document): - - processed_document = basic_process(document) - tokenizer = Tokenizer() - - texts = pd.concat([train_df["Comment_Adj"], val_df["Comment_Adj"], test_df["Comment_Adj"]]) - tokenizer.fit_on_texts(texts) - - all_sequences = tokenizer.texts_to_sequences(texts) - sequences = tokenizer.texts_to_sequences([processed_document]) - - padded_sequences = pad_sequences(sequences, maxlen=87, padding='post') - return padded_sequences - - -def bert_process(document): - tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') - inputs = tokenizer.encode_plus( - comment, - add_special_tokens=True, - max_length=128, - padding='max_length', - return_attention_mask=True, - truncation=True, - return_tensors='tf' - ) - - input_ids = inputs['input_ids'] - attention_mask = inputs['attention_mask'] - - return input_ids, attention_mask - - - - - - \ No newline at end of file