Merge pull request #29 from i-dot-ai/feature/ml-pipeline

Feature/ml pipeline
i-dot-ai · Apr 5, 2024 · c308e03 · c308e03
2 parents 64e0018 + 1f15c6c
commit c308e03
Show file tree

Hide file tree

Showing 12 changed files with 2,579 additions and 21 deletions.
diff --git a/...ion_analyser/consultations/migrations/0002_alter_theme_options_theme_question_and_more.py b/...ion_analyser/consultations/migrations/0002_alter_theme_options_theme_question_and_more.py
@@ -0,0 +1,30 @@
+# Generated by Django 5.0.3 on 2024-03-26 21:27
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Attach themes to questions.
+
+    Resets the ``theme`` model options, adds a nullable ``question`` foreign key
+    to ``theme``, and adds the ``unique_up_to_question`` uniqueness constraint.
+    """
+
+    # Applied on top of the app's initial migration.
+    dependencies = [
+        ("consultations", "0001_initial"),
+    ]
+
+    operations = [
+        # The theme model's options are replaced with an empty dict.
+        migrations.AlterModelOptions(
+            name="theme",
+            options={},
+        ),
+        # New nullable foreign key theme -> question; CASCADE means deleting a
+        # question also deletes the themes that point at it.
+        migrations.AddField(
+            model_name="theme",
+            name="question",
+            field=models.ForeignKey(
+                null=True, on_delete=django.db.models.deletion.CASCADE, to="consultations.question"
+            ),
+        ),
+        # No two themes may share the same summary, label, keywords and question.
+        migrations.AddConstraint(
+            model_name="theme",
+            constraint=models.UniqueConstraint(
+                fields=("summary", "label", "keywords", "question"), name="unique_up_to_question"
+            ),
+        ),
+    ]
diff --git a/consultation_analyser/consultations/ml_pipeline.py b/consultation_analyser/consultations/ml_pipeline.py
@@ -0,0 +1,86 @@
+from typing import Dict, List, NamedTuple, Union
+from uuid import UUID
+
+import numpy as np
+import pandas as pd
+from bertopic import BERTopic
+from bertopic.vectorizers import ClassTfidfTransformer
+from hdbscan import HDBSCAN
+from sentence_transformers import SentenceTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+from umap.umap_ import UMAP
+
+from consultation_analyser.consultations import models
+
+# Fixed seed passed to UMAP in get_topic_model so that results are reproducible.
+RANDOM_STATE = 12
+
+
+def get_embeddings_for_question(
+    answers_list: List[Dict[str, Union[UUID, str]]], embedding_model_name: str = "thenlper/gte-small"
+) -> List[Dict[str, Union[UUID, str, np.ndarray]]]:
+    """Attach a sentence embedding to every answer.
+
+    Args:
+        answers_list: Answer dicts; each must have a "free_text" entry, which is
+            the text that gets embedded.
+        embedding_model_name: Name of the SentenceTransformer model to load.
+
+    Returns:
+        A new list in the same order as ``answers_list``. Each element is a copy
+        of the corresponding input dict with an extra "embedding" entry; the
+        input dicts themselves are not modified.
+    """
+    # Nothing to embed: avoid loading the embedding model at all.
+    if not answers_list:
+        return []
+    free_text_responses = [answer["free_text"] for answer in answers_list]
+    embedding_model = SentenceTransformer(embedding_model_name)
+    embeddings = embedding_model.encode(free_text_responses)
+    # Embeddings are paired with answers by position, so the order is preserved.
+    return [{**answer, "embedding": embedding} for answer, embedding in zip(answers_list, embeddings)]
+
+
+def get_topic_model(answers_list_with_embeddings: List[Dict[str, Union[UUID, str, np.ndarray]]]) -> BERTopic:
+    """Fit a BERTopic model on the answers' free text, reusing their precomputed embeddings.
+
+    Args:
+        answers_list_with_embeddings: Answer dicts, each with a "free_text" and
+            an "embedding" entry.
+
+    Returns:
+        The fitted BERTopic model.
+    """
+    documents = [item["free_text"] for item in answers_list_with_embeddings]
+    embedding_matrix = np.array([item["embedding"] for item in answers_list_with_embeddings])
+    fitted_model = BERTopic(
+        # random_state is fixed so that the results can be reproduced
+        umap_model=UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=RANDOM_STATE),
+        hdbscan_model=HDBSCAN(
+            min_cluster_size=3, metric="euclidean", cluster_selection_method="eom", prediction_data=True
+        ),
+        vectorizer_model=CountVectorizer(stop_words="english"),
+        ctfidf_model=ClassTfidfTransformer(),
+    )
+    # Only the fitted model is needed here; the returned topic assignments are discarded.
+    fitted_model.fit_transform(documents, embeddings=embedding_matrix)
+    return fitted_model
+
+
+def get_answers_and_topics(topic_model: BERTopic, answers_list: List[Dict[str, Union[UUID, str]]]) -> pd.DataFrame:
+    """Build a table pairing every answer ID with the topic assigned to it.
+
+    Args:
+        topic_model: A BERTopic model; its ``get_document_info`` is called with
+            the answers' free text.
+        answers_list: Answer dicts with "id" and "free_text" entries.
+
+    Returns:
+        A DataFrame with the columns "id", "Topic", "Name" and "Representation".
+    """
+    # Both lists are derived from the same sequence, so IDs and texts stay aligned by position
+    texts = [item["free_text"] for item in answers_list]
+    ids = [item["id"] for item in answers_list]
+    # Look up the topic information for each document
+    document_info = topic_model.get_document_info(texts)
+    document_info["id"] = ids
+    wanted_columns = ["id", "Topic", "Name", "Representation"]
+    return document_info[wanted_columns]
+
+
+def save_themes_to_answers(answers_topics_df: pd.DataFrame) -> None:
+    """Save the theme of every row in the table onto the matching Answer.
+
+    Args:
+        answers_topics_df: Table with an "id" column (Answer ID), a "Name" column
+            (used as the theme label) and a "Representation" column (used as the
+            theme keywords).
+    """
+    for record in answers_topics_df.itertuples():
+        # One lookup per row; Answer.save_theme_to_answer does the actual saving.
+        models.Answer.objects.get(id=record.id).save_theme_to_answer(
+            theme_label=record.Name, theme_keywords=record.Representation
+        )
+
+
+def save_themes_for_question(question: models.Question) -> None:
+    """Run topic modelling over a question's answers and save the resulting themes.
+
+    Embeds the free text of every answer to ``question``, fits a topic model on
+    it, and saves the assigned theme onto each answer. Does nothing when the
+    question has no answers.
+
+    Args:
+        question: The question whose answers should be assigned themes.
+    """
+    # Order must remain the same through every step - so convert to a list once
+    answers_qs = models.Answer.objects.filter(question=question).order_by("created_at")
+    answers_list = list(answers_qs.values("id", "free_text"))
+    if not answers_list:
+        # A topic model cannot be fitted on an empty corpus, and one unanswered
+        # question must not abort processing of the remaining questions.
+        return
+    answers_list_with_embeddings = get_embeddings_for_question(answers_list)
+    topic_model = get_topic_model(answers_list_with_embeddings)
+    answers_topics_df = get_answers_and_topics(topic_model, answers_list_with_embeddings)
+    save_themes_to_answers(answers_topics_df)
+
+
+def save_themes_for_consultation(consultation_id: UUID) -> None:
+    """Save themes for every free-text question of a consultation.
+
+    Args:
+        consultation_id: ID of the consultation whose questions are processed.
+    """
+    free_text_questions = models.Question.objects.filter(
+        section__consultation__id=consultation_id, has_free_text=True
+    )
+    for free_text_question in free_text_questions:
+        save_themes_for_question(free_text_question)
+
+
+# TODO - decide how to handle topic -1 (outliers).
+# For now they are saved in the database like any other topic; we may want to
+# deal with them in the views.
+
+
+# TODO - Generate theme summaries using LLM
diff --git a/consultation_analyser/consultations/models.py b/consultation_analyser/consultations/models.py
@@ -62,9 +62,13 @@ class Theme(UUIDPrimaryKeyModel, TimeStampedModel):
     label = models.CharField(max_length=256, blank=True)
     summary = models.TextField(blank=True)
     keywords = models.JSONField(default=list)
+    # Duplicates info in Answer model, but needed for uniqueness constraint.
+    question = models.ForeignKey(Question, on_delete=models.CASCADE, null=True)
 
-    class Meta(UUIDPrimaryKeyModel.Meta, TimeStampedModel.Meta):
-        pass
+    class Meta:
+        # NOTE(review): unlike Answer.Meta, this does not extend
+        # UUIDPrimaryKeyModel.Meta / TimeStampedModel.Meta, so any options declared
+        # on those base Meta classes would no longer apply to Theme - confirm
+        # this is intended.
+        constraints = [
+            # No two themes may share the same summary, label, keywords and question.
+            models.UniqueConstraint(fields=["summary", "label", "keywords", "question"], name="unique_up_to_question"),
+        ]
 
 
 class Answer(UUIDPrimaryKeyModel, TimeStampedModel):
@@ -76,3 +80,13 @@ class Answer(UUIDPrimaryKeyModel, TimeStampedModel):
 
     class Meta(UUIDPrimaryKeyModel.Meta, TimeStampedModel.Meta):
         pass
+
+    def save_theme_to_answer(self, theme_label, theme_keywords):
+        """Point this answer at the matching theme, creating the theme if needed.
+
+        The theme is looked up (or created) by this answer's question together
+        with the given label and keywords, then this answer is saved.
+        """
+        theme_lookup = {
+            "question": self.question,
+            "label": theme_label,
+            "keywords": theme_keywords,
+        }
+        # get_or_create returns (object, created); only the object is needed.
+        self.theme = Theme.objects.get_or_create(**theme_lookup)[0]
+        self.save()
diff --git a/docs/erd.png b/docs/erd.png