Feature/add topic modelling #66

Open
wants to merge 2 commits into base: main
Changes from all commits
1 change: 1 addition & 0 deletions pyproject.toml
@@ -33,6 +33,7 @@ automl = [
"shap >= 0.43.0",
]
embedding = [
"bertopic >= 0.16.0",
"transformers[torch] >= 4.30.2",
"sentence-transformers >= 2.2.1",
"torchaudio >= 2.0.2",
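Since bertopic only ships through the optional embedding extra, it is worth confirming the install before exercising the code below. A minimal sanity check (a reviewer sketch, not part of this PR):

# Confirm the optional dependency is importable; the error message in
# get_BERTopic_imports below points at the same extra.
try:
    import bertopic  # noqa: F401
    print("bertopic available")
except ImportError:
    print('Missing optional dependency: pip install "sliceguard[embedding]"')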
56 changes: 54 additions & 2 deletions sliceguard/embeddings.py
@@ -1,5 +1,6 @@
# Embedding support for text, images, audio
from multiprocess import set_start_method
from typing import List, Optional
import pandas as pd
import datasets
import numpy as np
@@ -18,14 +19,50 @@ def get_embedding_imports():

return SentenceTransformer, AutoFeatureExtractor, AutoModel, torch

def get_BERTopic_imports():
    try:
        from bertopic import BERTopic
        from bertopic.representation import KeyBERTInspired
        from bertopic.vectorizers import ClassTfidfTransformer
        from sklearn.feature_extraction.text import CountVectorizer
    except ImportError:
        # Fail loudly when the optional embedding extra is missing.
        raise ImportError(
            "Optional dependency required! (pip install sliceguard[embedding])"
        )
    return CountVectorizer, ClassTfidfTransformer, KeyBERTInspired, BERTopic

def setup_BERTopic(top_n_words: int = 10, embedding_model: str = "all-MiniLM-L6-v2"):
    (
        CountVectorizer,
        ClassTfidfTransformer,
        KeyBERTInspired,
        BERTopic,
    ) = get_BERTopic_imports()
    # Prepare the BERTopic components.
    vectorizer_model = CountVectorizer(
        stop_words="english", max_features=17500, ngram_range=(1, 3)
    )
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    # The main representation of a topic
    main_representation = KeyBERTInspired()

    model = BERTopic(
        top_n_words=top_n_words,
        verbose=True,
        vectorizer_model=vectorizer_model,
        representation_model=main_representation,
        ctfidf_model=ctfidf_model,
        calculate_probabilities=True,
        embedding_model=embedding_model,
    )
    return model
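# Reviewer sketch (not part of the diff): because calculate_probabilities=True,
# the model returned by setup_BERTopic yields a full document-topic
# distribution, e.g.
#
#   topic_model = setup_BERTopic()
#   topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)
#   # probs has shape (n_documents, n_topics)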


def generate_text_embeddings(
    texts,
    model_name="all-MiniLM-L6-v2",
    hf_auth_token=None,
    hf_num_proc=None,
    hf_batch_size=1,
    use_topic_modelling=True,
    target_values: Optional[List[int]] = None,
):

SentenceTransformer, _, _, torch = get_embedding_imports()

if hf_num_proc:
@@ -37,10 +74,25 @@ def generate_text_embeddings(
    print(
        f"Embedding computation on {device} with batch size {hf_batch_size} and multiprocessing {hf_num_proc}."
    )

    model = SentenceTransformer(model_name, device=device, use_auth_token=hf_auth_token)
    embeddings = model.encode(texts, batch_size=hf_batch_size)
    if not use_topic_modelling:
        return embeddings

    # Calculate topic probabilities, as they are not part of the pre-computed embeddings.
    topic_model = setup_BERTopic()

    print(f"Topic probability computation on {device}.")

    # If all target values are numeric, use them as supervision for the topic model.
    if target_values is not None and pd.api.types.is_numeric_dtype(pd.Series(target_values)):
        _, probs = topic_model.fit_transform(texts, embeddings=embeddings, y=target_values)
    else:
        _, probs = topic_model.fit_transform(texts, embeddings=embeddings)
    return probs



def _extract_embeddings_images(model, feature_extractor, col_name="image"):
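To make the new return contract concrete: with the default use_topic_modelling=True, generate_text_embeddings now returns a document-topic probability matrix instead of sentence embeddings. A minimal sketch, under the assumption that the corpus is large enough for BERTopic to find more than one topic (a few dozen documents, as in the test below; the toy strings here are purely illustrative):

from sliceguard.embeddings import generate_text_embeddings

# Hypothetical toy corpus; real usage mirrors the 50-row IMDB sample in the test.
texts = ["great acting and a moving story"] * 25 + ["the plot made no sense at all"] * 25
labels = [1] * 25 + [0] * 25

# Plain sentence embeddings, shape (50, 384) for all-MiniLM-L6-v2:
embeddings = generate_text_embeddings(texts, use_topic_modelling=False)

# Document-topic probabilities, shape (50, n_topics); the numeric labels
# trigger the supervised fit_transform branch:
probs = generate_text_embeddings(texts, target_values=labels)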
1 change: 1 addition & 0 deletions sliceguard/sliceguard.py
@@ -592,6 +592,7 @@ def _prepare_data(
            hf_batch_size,
            df,
            mode,
            y,
        )

self._feature_encoders = feature_encoders
20 changes: 14 additions & 6 deletions sliceguard/utils.py
@@ -150,13 +150,15 @@ def encode_normalize_features(
    hf_batch_size: int,
    df: pd.DataFrame,
    mode: Literal["outlier", "automl", "native"],
    y: str,
):
"""
:param features: Names of features that should be encoded and normalized for later processing.
:param feature_types: The previously inferred or given types of the respective features.
:param feature_orders: If ordinal features are present you have to supply an order for each of them.
:param precomputed_embeddings: Precomputed embeddings that the user might supply.
:param df: The dataframe containing all the data.
:param y: the target values, column name of df
"""

feature_transformation_pipelines = (
@@ -219,6 +221,7 @@
            )  # normalize with unique category count to make compatible with range of one hot encoding
            encoded_data = np.concatenate((encoded_data, ordinal_data), axis=1)
        elif feature_type == "raw" or feature_type == "embedding":
            use_topic_modelling = False
            # Print model that will be used for computing embeddings
            if col in df.columns and col not in precomputed_embeddings:
                hf_model_params = (
@@ -275,7 +278,8 @@
                print(
                    f"Warning: Column {col} will be treated as text. If the column {col} is a path to some file, it is probably not supported yet!"
                )
                use_topic_modelling = True
                embeddings = generate_text_embeddings(
                    df[col].values,
                    **hf_model_params,
                    target_values=df[y],
                    use_topic_modelling=use_topic_modelling,
                )
                feature_transformation_pipelines[col] = {
                    "hf_model_params": hf_model_params,
                    "embedding_func": generate_text_embeddings,
@@ -312,10 +316,12 @@
            num_dimensions = num_embedding_dimensions
        else:
            num_dimensions = num_mixed_dimensions

        # Topic modelling output does not need to be reduced.
        if not use_topic_modelling:
            print(f"Using num dimensions {num_dimensions}.")

            umap_transformer = umap.UMAP(
                n_neighbors=min(embeddings.shape[0] - 1, n_neighbors),
                n_components=min(
                    embeddings.shape[0] - 2,
@@ -324,8 +330,10 @@
                # min_dist=0.0,
                set_op_mix_ratio=op_mix_ratio_prereduction,
            )
            reduced_embeddings = umap_transformer.fit_transform(embeddings)
            feature_transformation_pipelines[col]["umap_reducer"] = umap_transformer
        else:
            reduced_embeddings = embeddings

        # Do a normalization of the reduced embedding to match one hot encoded and ordinal encoding respectively
        # Therefore we will run hdbscan on the data real quick to do an estimate of the cluster distances.
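The shape reasoning behind the new branch: sentence embeddings are high-dimensional and need UMAP before the downstream clustering, while topic probabilities already carry just one column per topic. Illustrative shapes only (values assumed, not measured):

import numpy as np

sentence_embeddings = np.zeros((50, 384))  # all-MiniLM-L6-v2 output -> reduced via UMAP
topic_probabilities = np.zeros((50, 5))    # one column per discovered topic -> used as-is
print(sentence_embeddings.shape, topic_probabilities.shape)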
33 changes: 33 additions & 0 deletions tests/test_topic_modelling.py
@@ -0,0 +1,33 @@
import pandas as pd
import pickle
from sliceguard import SliceGuard
from sklearn.metrics import f1_score

def convert_labels_to_numerical(df):
    df["numerical"] = [
        1 if label == "positive" else 0 if label == "negative" else -1
        for label in df["sentiment"]
    ]
    return df

# Enough data for more than one topic.

df = pd.read_csv("test_data/IMDB Dataset.csv")[:50]
print(df.shape)
with open("test_data/doc_embeddings.pkl", "rb") as file:
    embeddings = pickle.load(file)
print(embeddings.shape)
embeddings = embeddings[:50]
print(embeddings.shape)

sg = SliceGuard()

df = convert_labels_to_numerical(df)
print(df.columns)
print(df.head())


issue_df = sg.find_issues(df, ["review"], y="numerical", metric=f1_score)
print(issue_df)
with open("test_data/test_issues_tm.pkl", "wb") as file:
    pickle.dump(issue_df, file)
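
For running this locally: the test expects test_data/IMDB Dataset.csv plus a pickle of matching precomputed doc embeddings. If the pickle is missing, it could be regenerated along these lines (a sketch, not part of the PR):

import pickle
import pandas as pd
from sliceguard.embeddings import generate_text_embeddings

df = pd.read_csv("test_data/IMDB Dataset.csv")[:50]
embeddings = generate_text_embeddings(df["review"].values, use_topic_modelling=False)
with open("test_data/doc_embeddings.pkl", "wb") as file:
    pickle.dump(embeddings, file)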