Feature/add topic modelling #66

Open
wants to merge 2 commits into base: main
Changes from all commits
1 change: 1 addition & 0 deletions pyproject.toml
@@ -33,6 +33,7 @@ automl = [
"shap >= 0.43.0",
]
embedding = [
"bertopic >= 0.16.0",
"transformers[torch] >= 4.30.2",
"sentence-transformers >= 2.2.1",
"torchaudio >= 2.0.2",
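Since bertopic only ships through the optional embedding extra, it is worth confirming the install before exercising the code below. A minimal sanity check (a reviewer sketch, not part of this PR):

# Confirm the optional dependency is importable; the error message in
# get_BERTopic_imports below points at the same extra.
try:
    import bertopic  # noqa: F401
    print("bertopic available")
except ImportError:
    print('Missing optional dependency: pip install "sliceguard[embedding]"')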
56 changes: 54 additions & 2 deletions sliceguard/embeddings.py
@@ -1,5 +1,6 @@
# Embedding support for text, images, audio
from multiprocess import set_start_method
from typing import List, Optional
import pandas as pd
import datasets
import numpy as np
@@ -18,14 +19,50 @@ def get_embedding_imports():

return SentenceTransformer, AutoFeatureExtractor, AutoModel, torch

def get_BERTopic_imports():
    try:
        from bertopic import BERTopic
        from bertopic.representation import KeyBERTInspired
        from bertopic.vectorizers import ClassTfidfTransformer
        from sklearn.feature_extraction.text import CountVectorizer
    except ImportError:
        # Fail loudly when the optional embedding extra is missing.
        raise ImportError(
            "Optional dependency required! (pip install sliceguard[embedding])"
        )
    return CountVectorizer, ClassTfidfTransformer, KeyBERTInspired, BERTopic

def setup_BERTopic(top_n_words: int = 10, embedding_model: str = "all-MiniLM-L6-v2"):
    (
        CountVectorizer,
        ClassTfidfTransformer,
        KeyBERTInspired,
        BERTopic,
    ) = get_BERTopic_imports()
    # Prepare the BERTopic components.
    vectorizer_model = CountVectorizer(
        stop_words="english", max_features=17500, ngram_range=(1, 3)
    )
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    # The main representation of a topic
    main_representation = KeyBERTInspired()

    model = BERTopic(
        top_n_words=top_n_words,
        verbose=True,
        vectorizer_model=vectorizer_model,
        representation_model=main_representation,
        ctfidf_model=ctfidf_model,
        calculate_probabilities=True,
        embedding_model=embedding_model,
    )
    return model
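# Reviewer sketch (not part of the diff): because calculate_probabilities=True,
# the model returned by setup_BERTopic yields a full document-topic
# distribution, e.g.
#
#   topic_model = setup_BERTopic()
#   topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)
#   # probs has shape (n_documents, n_topics)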


def generate_text_embeddings(
    texts,
    model_name="all-MiniLM-L6-v2",
    hf_auth_token=None,
    hf_num_proc=None,
    hf_batch_size=1,
    use_topic_modelling=True,
    target_values: Optional[List[int]] = None,
):

SentenceTransformer, _, _, torch = get_embedding_imports()

if hf_num_proc:
@@ -37,10 +74,25 @@ def generate_text_embeddings(
    print(
        f"Embedding computation on {device} with batch size {hf_batch_size} and multiprocessing {hf_num_proc}."
    )

    model = SentenceTransformer(model_name, device=device, use_auth_token=hf_auth_token)
    embeddings = model.encode(texts, batch_size=hf_batch_size)
    if not use_topic_modelling:
        return embeddings

    # Calculate topic probabilities, as they are not part of the pre-computed embeddings.
    topic_model = setup_BERTopic()

    print(f"Topic probability computation on {device}.")

    # If all target values are numeric, use them as supervision for the topic model.
    if target_values is not None and pd.api.types.is_numeric_dtype(pd.Series(target_values)):
        _, probs = topic_model.fit_transform(texts, embeddings=embeddings, y=target_values)
    else:
        _, probs = topic_model.fit_transform(texts, embeddings=embeddings)
    return probs



def _extract_embeddings_images(model, feature_extractor, col_name="image"):
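To make the new return contract concrete: with the default use_topic_modelling=True, generate_text_embeddings now returns a document-topic probability matrix instead of sentence embeddings. A minimal sketch, under the assumption that the corpus is large enough for BERTopic to find more than one topic (a few dozen documents, as in the test below; the toy strings here are purely illustrative):

from sliceguard.embeddings import generate_text_embeddings

# Hypothetical toy corpus; real usage mirrors the 50-row IMDB sample in the test.
texts = ["great acting and a moving story"] * 25 + ["the plot made no sense at all"] * 25
labels = [1] * 25 + [0] * 25

# Plain sentence embeddings, shape (50, 384) for all-MiniLM-L6-v2:
embeddings = generate_text_embeddings(texts, use_topic_modelling=False)

# Document-topic probabilities, shape (50, n_topics); the numeric labels
# trigger the supervised fit_transform branch:
probs = generate_text_embeddings(texts, target_values=labels)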
1 change: 1 addition & 0 deletions sliceguard/sliceguard.py
@@ -592,6 +592,7 @@ def _prepare_data(
            hf_batch_size,
            df,
            mode,
            y,
        )

self._feature_encoders = feature_encoders
20 changes: 14 additions & 6 deletions sliceguard/utils.py
@@ -150,13 +150,15 @@ def encode_normalize_features(
    hf_batch_size: int,
    df: pd.DataFrame,
    mode: Literal["outlier", "automl", "native"],
    y: str,
):
"""
:param features: Names of features that should be encoded and normalized for later processing.
:param feature_types: The previously inferred or given types of the respective features.
:param feature_orders: If ordinal features are present you have to supply an order for each of them.
:param precomputed_embeddings: Precomputed embeddings that the user might supply.
:param df: The dataframe containing all the data.
:param y: the target values, column name of df
"""

feature_transformation_pipelines = (
@@ -219,6 +221,7 @@
            )  # normalize with unique category count to make compatible with range of one hot encoding
            encoded_data = np.concatenate((encoded_data, ordinal_data), axis=1)
        elif feature_type == "raw" or feature_type == "embedding":
            use_topic_modelling = False
            # Print model that will be used for computing embeddings
            if col in df.columns and col not in precomputed_embeddings:
                hf_model_params = (
@@ -275,7 +278,8 @@
                print(
                    f"Warning: Column {col} will be treated as text. If the column {col} is a path to some file, it is probably not supported yet!"
                )
                use_topic_modelling = True
                embeddings = generate_text_embeddings(
                    df[col].values,
                    **hf_model_params,
                    target_values=df[y],
                    use_topic_modelling=use_topic_modelling,
                )
                feature_transformation_pipelines[col] = {
                    "hf_model_params": hf_model_params,
                    "embedding_func": generate_text_embeddings,
@@ -312,10 +316,12 @@
            num_dimensions = num_embedding_dimensions
        else:
            num_dimensions = num_mixed_dimensions

        # Topic modelling output does not need to be reduced.
        if not use_topic_modelling:
            print(f"Using num dimensions {num_dimensions}.")

            umap_transformer = umap.UMAP(
                n_neighbors=min(embeddings.shape[0] - 1, n_neighbors),
                n_components=min(
                    embeddings.shape[0] - 2,
@@ -324,8 +330,10 @@
                # min_dist=0.0,
                set_op_mix_ratio=op_mix_ratio_prereduction,
            )
            reduced_embeddings = umap_transformer.fit_transform(embeddings)
            feature_transformation_pipelines[col]["umap_reducer"] = umap_transformer
        else:
            reduced_embeddings = embeddings

        # Do a normalization of the reduced embedding to match one hot encoded and ordinal encoding respectively
        # Therefore we will run hdbscan on the data real quick to do an estimate of the cluster distances.
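The shape reasoning behind the new branch: sentence embeddings are high-dimensional and need UMAP before the downstream clustering, while topic probabilities already carry just one column per topic. Illustrative shapes only (values assumed, not measured):

import numpy as np

sentence_embeddings = np.zeros((50, 384))  # all-MiniLM-L6-v2 output -> reduced via UMAP
topic_probabilities = np.zeros((50, 5))    # one column per discovered topic -> used as-is
print(sentence_embeddings.shape, topic_probabilities.shape)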
33 changes: 33 additions & 0 deletions tests/test_topic_modelling.py
@@ -0,0 +1,33 @@
import pandas as pd
import pickle
from sliceguard import SliceGuard
from sklearn.metrics import f1_score

def convert_labels_to_numerical(df):
    df["numerical"] = [
        1 if label == "positive" else 0 if label == "negative" else -1
        for label in df["sentiment"]
    ]
    return df

# Enough data for more than one topic.

df = pd.read_csv("test_data/IMDB Dataset.csv")[:50]
print(df.shape)
with open("test_data/doc_embeddings.pkl", "rb") as file:
    embeddings = pickle.load(file)
print(embeddings.shape)
embeddings = embeddings[:50]
print(embeddings.shape)

sg = SliceGuard()

df = convert_labels_to_numerical(df)
print(df.columns)
print(df.head())


issue_df = sg.find_issues(df, ["review"], y="numerical", metric=f1_score)
print(issue_df)
with open("test_data/test_issues_tm.pkl", "wb") as file:
    pickle.dump(issue_df, file)
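
For running this locally: the test expects test_data/IMDB Dataset.csv plus a pickle of matching precomputed doc embeddings. If the pickle is missing, it could be regenerated along these lines (a sketch, not part of the PR):

import pickle
import pandas as pd
from sliceguard.embeddings import generate_text_embeddings

df = pd.read_csv("test_data/IMDB Dataset.csv")[:50]
embeddings = generate_text_embeddings(df["review"].values, use_topic_modelling=False)
with open("test_data/doc_embeddings.pkl", "wb") as file:
    pickle.dump(embeddings, file)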