Commit

Idea:
jcollopy-tulane committed Apr 30, 2024
1 parent b8522ca commit 2c940fa
Showing 4 changed files with 146 additions and 2 deletions.
1 change: 0 additions & 1 deletion nlp/cli.py
@@ -26,7 +26,6 @@
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
import torch

from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import BertTokenizerFast
from transformers import BertForSequenceClassification, AdamW
@@ -1060,6 +1060,72 @@
"# Print the found comment\n",
"print(found_comment)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b5d582d2",
"metadata": {},
"outputs": [
{
"ename": "ImportError",
"evalue": "attempted relative import with no known parent package",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpickle\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfunctions\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfunctions_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m process_text, basic_process, cnn_process\n",
"\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package"
]
}
],
"source": [
"import pickle\n",
"from ...functions.functions_utils import process_text, basic_process, cnn_process"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0423d0d1",
"metadata": {},
"outputs": [],
"source": [
"def return_coef_bnb(text):\n",
"\n",
" model = bnb\n",
" text = process_text(input_field)\n",
" text = bnb_vectorizer.transform([text])\n",
"\n",
" vocabulary = vectorizer.get_feature_names_out()\n",
" coefficients = model.coef_[0]\n",
"\n",
" # Map coefficients to words\n",
" word_coefficient_map = {word: coef for word, coef in zip(vocabulary, coefficients)}\n",
"\n",
" # Print coefficients for words in the input text\n",
" for word in input_text.split():\n",
" if word in word_coefficient_map:\n",
" print(f\"Word: {word}, Coefficient: {word_coefficient_map[word]}\")\n",
" else:\n",
" print(f\"Word: {word}, Coefficient: 0\") "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddb1cd57",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6f2f431",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -1078,7 +1144,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
"version": "3.9.16"
}
},
"nbformat": 4,
Empty file added notebooks/functions/__init__.py
79 changes: 79 additions & 0 deletions notebooks/functions/functions_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import DistilBertTokenizer

def process_text(document):
    # Tokenize the document
    tokens = document.split()
    tokens = [re.sub(r'^\W+|\W+$', '', token) for token in tokens]
    tokens = [token.lower() for token in tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stem the tokens
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Return the processed text
    return ' '.join(stemmed_tokens)
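
# Example (assumed behavior):
#   process_text("The movie was great!")  ->  "movi great"
# after punctuation stripping, stopword removal, and Porter stemming.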


tokenizer = Tokenizer()
train_df = pd.read_csv("/Users/jackiecollopy/Downloads/project-reddit/data/train.csv")
# val/test presumably have their own files; the original loaded train.csv for all three
val_df = pd.read_csv("/Users/jackiecollopy/Downloads/project-reddit/data/val.csv")
test_df = pd.read_csv("/Users/jackiecollopy/Downloads/project-reddit/data/test.csv")

def basic_process(document):
    # Tokenize the document
    tokens = document.split()
    # Strip punctuation from the start and end of each token and lowercase it
    tokens = [re.sub(r'^\W+|\W+$', '', token).lower() for token in tokens]
    # Join the processed tokens back into a string
    processed_text = ' '.join(tokens)
    return processed_text
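
# Example (assumed behavior):
#   basic_process("Great post!!")  ->  "great post"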

def cnn_process(document):
    processed_document = basic_process(document)

    # Fit the shared tokenizer on every split so token ids match the CNN's vocabulary
    texts = pd.concat([train_df["Comment_Adj"], val_df["Comment_Adj"], test_df["Comment_Adj"]])
    tokenizer.fit_on_texts(texts)

    sequences = tokenizer.texts_to_sequences([processed_document])
    padded_sequences = pad_sequences(sequences, maxlen=87, padding='post')
    return padded_sequences
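
# Example (assumed usage): cnn_process("Great post!") returns a (1, 87)
# array of token ids, zero-padded on the right to the CNN's input length.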


def bert_process(document):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    inputs = tokenizer.encode_plus(
        document,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='tf'
    )

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    return input_ids, attention_mask
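
The ImportError captured in the notebook above comes from using a relative import ("from ...functions.functions_utils import ...") outside of a package. A minimal usage sketch, assuming the notebook is launched from the notebooks/ directory (where functions/__init__.py makes the package importable) and the hardcoded data CSVs are present:

from functions.functions_utils import process_text, basic_process, cnn_process, bert_process

print(process_text("The movie was great!"))   # -> "movi great"
print(basic_process("Great post!!"))          # -> "great post"
print(cnn_process("Great post!").shape)       # -> (1, 87)
ids, mask = bert_process("Great post!")       # (1, 128) input ids and attention mask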