diff --git a/pyproject.toml b/pyproject.toml
index 7e404da..8f107d7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ automl = [
     "shap >= 0.43.0",
 ]
 embedding = [
+    "bertopic >= 0.16.0",
     "transformers[torch] >= 4.30.2",
     "sentence-transformers >= 2.2.1",
     "torchaudio >= 2.0.2",
diff --git a/sliceguard/embeddings.py b/sliceguard/embeddings.py
index 878a8f0..e7be1e9 100644
--- a/sliceguard/embeddings.py
+++ b/sliceguard/embeddings.py
@@ -1,5 +1,6 @@
 # Embedding support for text, images, audio
 from multiprocess import set_start_method
+from typing import List, Optional
 import pandas as pd
 import datasets
 import numpy as np
@@ -18,6 +19,39 @@ def get_embedding_imports():
     return SentenceTransformer, AutoFeatureExtractor, AutoModel, torch
 
 
+def get_BERTopic_imports():
+    try:
+        from bertopic import BERTopic
+        from bertopic.representation import KeyBERTInspired
+        from bertopic.vectorizers import ClassTfidfTransformer
+        from sklearn.feature_extraction.text import CountVectorizer
+    except ImportError:
+        raise Warning(
+            "Optional dependency required! (pip install sliceguard[embedding])"
+        )
+    return CountVectorizer, ClassTfidfTransformer, KeyBERTInspired, BERTopic
+
+def setup_BERTopic(top_n_words: int = 10, embedding_model: str = "all-MiniLM-L6-v2"):
+    CountVectorizer, ClassTfidfTransformer, KeyBERTInspired, BERTopic = get_BERTopic_imports()
+    # Prepare the BERTopic components.
+    vectorizer_model = CountVectorizer(
+        stop_words="english", max_features=17500, ngram_range=(1, 3)
+    )
+    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
+    # KeyBERTInspired provides the main representation of a topic.
+    main_representation = KeyBERTInspired()
+
+    model = BERTopic(
+        top_n_words=top_n_words,
+        verbose=True,
+        vectorizer_model=vectorizer_model,
+        representation_model=main_representation,
+        ctfidf_model=ctfidf_model,
+        calculate_probabilities=True,
+        embedding_model=embedding_model,
+    )
+    return model
+
 
 def generate_text_embeddings(
     texts,
@@ -25,7 +59,10 @@
     hf_auth_token=None,
     hf_num_proc=None,
     hf_batch_size=1,
+    use_topic_modelling: bool = True,
+    target_values: Optional[List[int]] = None,
 ):
+
     SentenceTransformer, _, _, torch = get_embedding_imports()
 
     if hf_num_proc:
@@ -37,10 +74,25 @@
     print(
         f"Embedding computation on {device} with batch size {hf_batch_size} and multiprocessing {hf_num_proc}."
     )
 
     model = SentenceTransformer(model_name, device=device, use_auth_token=hf_auth_token)
     embeddings = model.encode(texts, batch_size=hf_batch_size)
-    return embeddings
+    if not use_topic_modelling:
+        return embeddings
+
+    # Calculate topic probabilities, since they are not part of the pre-computed embeddings.
+    model = setup_BERTopic()
+
+    print(
+        f"Topic probability computation on {device}."
+    )
+    # If the target values are numeric, use them to guide the topic model.
+    if target_values is not None and pd.api.types.is_numeric_dtype(pd.Series(target_values)):
+        _, probs = model.fit_transform(texts, embeddings=embeddings, y=target_values)
+    else:
+        _, probs = model.fit_transform(texts, embeddings=embeddings)
+    return probs
+
 
 
 def _extract_embeddings_images(model, feature_extractor, col_name="image"):
diff --git a/sliceguard/sliceguard.py b/sliceguard/sliceguard.py
index 92a3782..f235375 100644
--- a/sliceguard/sliceguard.py
+++ b/sliceguard/sliceguard.py
@@ -592,6 +592,7 @@ def _prepare_data(
             hf_batch_size,
             df,
             mode,
+            y,
         )
 
         self._feature_encoders = feature_encoders
diff --git a/sliceguard/utils.py b/sliceguard/utils.py
index 57ee37f..8de6be2 100644
--- a/sliceguard/utils.py
+++ b/sliceguard/utils.py
@@ -150,6 +150,7 @@ def encode_normalize_features(
     hf_batch_size: int,
     df: pd.DataFrame,
     mode: Literal["outlier", "automl", "native"],
+    y: str,
 ):
     """
     :param features: Names of features that should be encoded and normalized for later processing.
@@ -157,6 +158,7 @@
     :param feature_orders: If ordinal features are present you have to supply an order for each of them.
     :param precomputed_embeddings: Precomputed embeddings that the user might supply.
     :param df: The dataframe containing all the data.
+    :param y: Name of the column in df that contains the target values.
     """
 
     feature_transformation_pipelines = (
@@ -219,6 +221,7 @@
             )  # normalize with unique category count to make compatible with range of one hot encoding
             encoded_data = np.concatenate((encoded_data, ordinal_data), axis=1)
         elif feature_type == "raw" or feature_type == "embedding":
+            use_topic_modelling = False
             # Print model that will be used for computing embeddings
             if col in df.columns and col not in precomputed_embeddings:
                 hf_model_params = (
@@ -275,7 +278,8 @@
                 print(
                     f"Warning: Column {col} will be treated as text. If the column {col} is a path to some file it is probably not supported yet!"
                 )
-                embeddings = generate_text_embeddings(df[col].values, **hf_model_params)
+                use_topic_modelling = True
+                embeddings = generate_text_embeddings(df[col].values, **hf_model_params, target_values=df[y] if y is not None else None, use_topic_modelling=use_topic_modelling)
                 feature_transformation_pipelines[col] = {
                     "hf_model_params": hf_model_params,
                     "embedding_func": generate_text_embeddings,
@@ -312,10 +316,12 @@
                 num_dimensions = num_embedding_dimensions
             else:
                 num_dimensions = num_mixed_dimensions
 
+
+            # Topic probabilities are already low-dimensional and are not reduced further.
+            if not use_topic_modelling:
+                print(f"Using num dimensions {num_dimensions}.")
-            print(f"Using num dimensions {num_dimensions}.")
-
-            umap_transformer = umap.UMAP(
+                umap_transformer = umap.UMAP(
                 n_neighbors=min(embeddings.shape[0] - 1, n_neighbors),
                 n_components=min(
                     embeddings.shape[0] - 2,
@@ -324,8 +330,10 @@
                 # min_dist=0.0,
                 set_op_mix_ratio=op_mix_ratio_prereduction,
             )
-            reduced_embeddings = umap_transformer.fit_transform(embeddings)
-            feature_transformation_pipelines[col]["umap_reducer"] = umap_transformer
+                reduced_embeddings = umap_transformer.fit_transform(embeddings)
+                feature_transformation_pipelines[col]["umap_reducer"] = umap_transformer
+            else:
+                reduced_embeddings = embeddings
 
             # Do a normalization of the reduced embedding to match one hot encoded and ordinal encoding respectively
             # Therefore we will run hdbscan on the data real quick to do an estimate of the cluster distances.
diff --git a/tests/test_topic_modelling.py b/tests/test_topic_modelling.py
new file mode 100644
index 0000000..d1ae922
--- /dev/null
+++ b/tests/test_topic_modelling.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import pickle
+from sliceguard import SliceGuard
+from sklearn.metrics import f1_score
+
+def convert_labels_to_numerical(df):
+    df["numerical"] = [
+        1 if label == "positive" else 0 if label == "negative" else -1
+        for label in df["sentiment"]
+    ]
+    return df
+
+# Enough data for more than one topic.
+
+df = pd.read_csv("test_data/IMDB Dataset.csv")[:50]
+print(df.shape)
+with open("test_data/doc_embeddings.pkl", "rb") as file:
+    embeddings = pickle.load(file)
+print(embeddings.shape)
+embeddings = embeddings[:50]
+print(embeddings.shape)
+
+sg = SliceGuard()
+
+df = convert_labels_to_numerical(df)
+print(df.columns)
+print(df.head())
+
+
+issue_df = sg.find_issues(df, ["review"], y="numerical", metric=f1_score)
+print(issue_df)
+with open("test_data/test_issues_tm.pkl", "wb") as file:
+    pickle.dump(issue_df, file)
\ No newline at end of file
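Usage sketch for reviewers (not part of the diff): the snippet below illustrates how the new topic-modelling path is meant to be exercised, assuming the embedding extra is installed (pip install sliceguard[embedding]). The dataset, column names, and the explicit model_name are illustrative assumptions rather than values taken from this PR; BERTopic/UMAP need more than a handful of documents, so a small slice of 20 newsgroups stands in for real data.

# Illustrative only: dataset, column names, and model_name are assumptions.
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import f1_score

from sliceguard import SliceGuard
from sliceguard.embeddings import generate_text_embeddings

# A small labelled text dataset; BERTopic needs more than a handful of documents.
data = fetch_20newsgroups(
    subset="train",
    categories=["rec.autos", "sci.med"],
    remove=("headers", "footers", "quotes"),
)
df = pd.DataFrame({"text": data.data, "label": data.target}).head(200)

# Direct call: with use_topic_modelling=True the function returns per-document
# topic probabilities (n_documents x n_topics) instead of sentence embeddings.
probs = generate_text_embeddings(
    df["text"].values,
    model_name="all-MiniLM-L6-v2",  # assumed choice, mirrors the setup_BERTopic default
    use_topic_modelling=True,
    target_values=df["label"].values,
)
print(probs.shape)

# Through the public API, raw text columns now take the same topic-modelling path.
sg = SliceGuard()
issue_df = sg.find_issues(df, ["text"], y="label", metric=f1_score)
print(issue_df)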