Skip to content

Commit

Permalink
Merge pull request #29 from i-dot-ai/feature/ml-pipeline
Browse files Browse the repository at this point in the history
Feature/ml pipeline
  • Loading branch information
nmenezes0 authored Apr 5, 2024
2 parents 64e0018 + 1f15c6c commit c308e03
Show file tree
Hide file tree
Showing 12 changed files with 2,579 additions and 21 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Generated by Django 5.0.3 on 2024-03-26 21:27

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Add a ``question`` link to ``theme`` and a uniqueness constraint that includes it."""

    dependencies = [("consultations", "0001_initial")]

    operations = [
        # Clear any model-level Meta options previously recorded for Theme.
        migrations.AlterModelOptions(name="theme", options={}),
        # Nullable foreign key to the question; deleting the question cascades to its themes.
        migrations.AddField(
            model_name="theme",
            name="question",
            field=models.ForeignKey(
                null=True,
                on_delete=models.CASCADE,
                to="consultations.question",
            ),
        ),
        # A theme is identified by its content together with the question it belongs to.
        migrations.AddConstraint(
            model_name="theme",
            constraint=models.UniqueConstraint(
                fields=("summary", "label", "keywords", "question"),
                name="unique_up_to_question",
            ),
        ),
    ]
86 changes: 86 additions & 0 deletions consultation_analyser/consultations/ml_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from typing import Dict, List, NamedTuple, Union
from uuid import UUID

import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap.umap_ import UMAP

from consultation_analyser.consultations import models

# Fixed seed handed to the UMAP model in get_topic_model, so repeated runs can reproduce the same results.
RANDOM_STATE = 12 # For reproducibility


def get_embeddings_for_question(
    answers_list: List[Dict[str, Union[UUID, str]]], embedding_model_name: str = "thenlper/gte-small"
) -> List[Dict[str, Union[UUID, str, np.ndarray]]]:
    """Attach a sentence embedding to every answer.

    Args:
        answers_list: Answer records. Each must contain a ``"free_text"`` entry,
            which is the text that gets embedded.
        embedding_model_name: Model name passed to ``SentenceTransformer``.

    Returns:
        A new list, in the same order as ``answers_list``, holding a copy of each
        input dict with an additional ``"embedding"`` key. The input dicts are
        not modified.
    """
    # Nothing to embed: return early instead of loading the model for no work.
    if not answers_list:
        return []
    free_text_responses = [answer["free_text"] for answer in answers_list]
    embedding_model = SentenceTransformer(embedding_model_name)
    embeddings = embedding_model.encode(free_text_responses)
    # zip keeps each embedding paired with the answer it was computed from.
    return [{**answer, "embedding": embedding} for answer, embedding in zip(answers_list, embeddings)]


def get_topic_model(answers_list_with_embeddings: List[Dict[str, Union[UUID, str, np.ndarray]]]) -> BERTopic:
    """Fit a BERTopic model on the answers' free text using their precomputed embeddings.

    Args:
        answers_list_with_embeddings: Answer records, each with a ``"free_text"``
            entry and an ``"embedding"`` entry.

    Returns:
        The fitted ``BERTopic`` model.
    """
    documents = []
    vectors = []
    for entry in answers_list_with_embeddings:
        documents.append(entry["free_text"])
        vectors.append(entry["embedding"])
    embedding_matrix = np.array(vectors)

    fitted_model = BERTopic(
        # Set random_state so that we can reproduce the results
        umap_model=UMAP(
            n_neighbors=15,
            n_components=5,
            min_dist=0.0,
            metric="cosine",
            random_state=RANDOM_STATE,
        ),
        hdbscan_model=HDBSCAN(
            min_cluster_size=3,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
        ),
        vectorizer_model=CountVectorizer(stop_words="english"),
        ctfidf_model=ClassTfidfTransformer(),
    )
    # Only the fitted model is needed; the per-document output of fit_transform is discarded.
    fitted_model.fit_transform(documents, embeddings=embedding_matrix)
    return fitted_model


def get_answers_and_topics(topic_model: BERTopic, answers_list: List[Dict[str, Union[UUID, str]]]) -> pd.DataFrame:
    """Build a table pairing each answer ID with the topic assigned to it.

    Args:
        topic_model: Model whose ``get_document_info`` is queried with the answers' text.
        answers_list: Answer records, each with ``"id"`` and ``"free_text"`` entries.

    Returns:
        A DataFrame with the columns ``id``, ``Topic``, ``Name`` and ``Representation``.
    """
    # Texts and IDs are collected in the same order so they can be aligned row by row below.
    texts = []
    ids = []
    for entry in answers_list:
        texts.append(entry["free_text"])
        ids.append(entry["id"])

    # Assign topics to answers
    document_info = topic_model.get_document_info(texts)
    document_info["id"] = ids
    wanted_columns = ["id", "Topic", "Name", "Representation"]
    return document_info[wanted_columns]


def save_themes_to_answers(answers_topics_df: pd.DataFrame) -> None:
    """Store the theme from each row of the table on the corresponding answer.

    Args:
        answers_topics_df: Table whose rows expose ``id``, ``Name`` and
            ``Representation``; ``id`` is used to look up the ``Answer``.
    """
    for record in answers_topics_df.itertuples():
        matching_answer = models.Answer.objects.get(id=record.id)
        # The topic name becomes the theme label and its representation becomes the keywords.
        matching_answer.save_theme_to_answer(
            theme_label=record.Name,
            theme_keywords=record.Representation,
        )


def save_themes_for_question(question: models.Question) -> None:
    """Run the topic pipeline for one question and save the resulting themes on its answers.

    The answers are embedded, a topic model is fitted on them, and the topic
    assigned to each answer is saved as that answer's theme. A question with
    no answers is skipped.

    Args:
        question: The question whose answers should be themed.
    """
    # Order must remain the same - so convert to list
    answers_qs = models.Answer.objects.filter(question=question).order_by("created_at")
    answers_list = list(answers_qs.values("id", "free_text"))
    if not answers_list:
        # Nothing to cluster; fitting a topic model on an empty corpus would fail.
        return
    answers_list_with_embeddings = get_embeddings_for_question(answers_list)
    topic_model = get_topic_model(answers_list_with_embeddings)
    # The embedded records still carry "id" and "free_text", so they can be reused here.
    answers_topics_df = get_answers_and_topics(topic_model, answers_list_with_embeddings)
    save_themes_to_answers(answers_topics_df)


def save_themes_for_consultation(consultation_id: UUID) -> None:
    """Generate and save themes for every free-text question of a consultation.

    Args:
        consultation_id: ID of the consultation whose questions are processed.
    """
    free_text_questions = models.Question.objects.filter(
        section__consultation__id=consultation_id,
        has_free_text=True,
    )
    for free_text_question in free_text_questions:
        save_themes_for_question(free_text_question)


# TODO - decide how to handle topic -1 (outliers).
# They are currently saved to the database; we may want to deal with them in the views.


# TODO - Generate theme summaries using LLM
18 changes: 16 additions & 2 deletions consultation_analyser/consultations/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,13 @@ class Theme(UUIDPrimaryKeyModel, TimeStampedModel):
label = models.CharField(max_length=256, blank=True)
summary = models.TextField(blank=True)
keywords = models.JSONField(default=list)
# Duplicates info in Answer model, but needed for uniqueness constraint.
question = models.ForeignKey(Question, on_delete=models.CASCADE, null=True)

class Meta(UUIDPrimaryKeyModel.Meta, TimeStampedModel.Meta):
pass
class Meta:
constraints = [
models.UniqueConstraint(fields=["summary", "label", "keywords", "question"], name="unique_up_to_question"),
]


class Answer(UUIDPrimaryKeyModel, TimeStampedModel):
Expand All @@ -76,3 +80,13 @@ class Answer(UUIDPrimaryKeyModel, TimeStampedModel):

class Meta(UUIDPrimaryKeyModel.Meta, TimeStampedModel.Meta):
pass

def save_theme_to_answer(self, theme_label, theme_keywords):
    """Attach the matching theme to this answer, creating the theme if needed.

    The theme is looked up by this answer's question together with the given
    label and keywords, and the answer is then saved.
    """
    theme_lookup = {
        "question": self.question,
        "label": theme_label,
        "keywords": theme_keywords,
    }
    # get_or_create returns (object, created); only the object is needed.
    self.theme = Theme.objects.get_or_create(**theme_lookup)[0]
    self.save()
Binary file modified docs/erd.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit c308e03

Please sign in to comment.