Skip to content

Commit

Permalink
multi-model testing
Browse files Browse the repository at this point in the history
  • Loading branch information
shivankacker committed Aug 20, 2024
1 parent d60c51a commit 0fb18ce
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 87 deletions.
13 changes: 12 additions & 1 deletion ayushma/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from djangoql.admin import DjangoQLSearchMixin

from ayushma.models.services import Service, TempToken
from ayushma.models.testsuite import Feedback, TestQuestion, TestRun, TestSuite
from ayushma.models.testsuite import (
Feedback,
TestQuestion,
TestResult,
TestRun,
TestSuite,
)

from .models import APIKey, Chat, ChatMessage, Document, Project, User

Expand Down Expand Up @@ -132,4 +138,9 @@ class TestRunAdmin(DjangoQLSearchMixin, admin.ModelAdmin):
pass


@admin.register(TestResult)
class TestResultAdmin(DjangoQLSearchMixin, admin.ModelAdmin):
    """Expose TestResult in the Django admin, with DjangoQL query-language search."""


admin.site.site_header = "Ayushma Admin"
52 changes: 52 additions & 0 deletions ayushma/migrations/0055_testresult_model_testrun_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Generated by Django 4.2.6 on 2024-07-26 11:39

import django.contrib.postgres.fields
from django.db import migrations, models


class Migration(migrations.Migration):
    """Add multi-model test-run support: TestRun.models and TestResult.model."""

    dependencies = [
        ("ayushma", "0054_alter_chat_model_alter_project_model"),
    ]

    operations = [
        # TestResult.model: which LLM (ModelType integer value) produced this
        # individual result. Nullable/blank so rows created before this feature
        # (single-model runs) remain valid without a data migration.
        migrations.AddField(
            model_name="testresult",
            name="model",
            field=models.IntegerField(
                blank=True,
                choices=[
                    (1, "Gpt 3 5"),
                    (2, "Gpt 3 5 16K"),
                    (3, "Gpt 4"),
                    (4, "Gpt 4 32K"),
                    (5, "Gpt 4 Visual"),
                    (6, "Gpt 4 Turbo"),
                    (7, "Gpt 4 Omni"),
                    (8, "Gpt 4 Omni Mini"),
                ],
                null=True,
            ),
        ),
        # TestRun.models: the list of ModelType values a run should exercise.
        # Postgres-specific ArrayField; `default=list` gives existing rows an
        # empty list, and `size=None` leaves the array length unbounded.
        migrations.AddField(
            model_name="testrun",
            name="models",
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.IntegerField(
                    choices=[
                        (1, "Gpt 3 5"),
                        (2, "Gpt 3 5 16K"),
                        (3, "Gpt 4"),
                        (4, "Gpt 4 32K"),
                        (5, "Gpt 4 Visual"),
                        (6, "Gpt 4 Turbo"),
                        (7, "Gpt 4 Omni"),
                        (8, "Gpt 4 Omni Mini"),
                    ]
                ),
                default=list,
                size=None,
            ),
        ),
    ]
5 changes: 4 additions & 1 deletion ayushma/models/testsuite.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from django.contrib.postgres.fields import ArrayField
from django.db import models
from django.db.models import (
CASCADE,
Expand All @@ -10,7 +11,7 @@

from ayushma.models import Project
from ayushma.models.document import Document
from ayushma.models.enums import FeedBackRating, StatusChoices
from ayushma.models.enums import FeedBackRating, ModelType, StatusChoices
from ayushma.models.users import User
from utils.models.base import BaseModel

Expand All @@ -34,6 +35,7 @@ class TestRun(BaseModel):
project = ForeignKey(Project, on_delete=CASCADE)
status = IntegerField(choices=StatusChoices.choices, default=StatusChoices.RUNNING)
references = models.BooleanField(default=True)
models = ArrayField(models.IntegerField(choices=ModelType.choices), default=list)


class TestResult(BaseModel):
Expand All @@ -45,6 +47,7 @@ class TestResult(BaseModel):
cosine_sim = FloatField()
bleu_score = FloatField()
references = models.ManyToManyField(Document, blank=True)
model = models.IntegerField(choices=ModelType.choices, blank=True, null=True)


class Feedback(BaseModel):
Expand Down
1 change: 1 addition & 0 deletions ayushma/serializers/testsuite.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class Meta:
"status",
"test_results",
"references",
"models",
)
read_only_fields = (
"external_id",
Expand Down
179 changes: 94 additions & 85 deletions ayushma/tasks/testrun.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,100 +30,109 @@ def mark_test_run_as_completed(self, test_run_id):
test_run = TestRun.objects.get(id=test_run_id)
test_suite = test_run.test_suite
test_questions = test_suite.testquestion_set.all()
test_models = test_run.models

temperature = test_suite.temperature
topk = test_suite.topk

chat = Chat()
chat.title = "Test Run: " + test_run.created_at.strftime("%Y-%m-%d %H:%M:%S")
chat.project = test_run.project
chat.save()

for test_question in test_questions:
sleep(
30
) # Wait for 30 seconds to give previous test question time to complete

if TestRun.objects.get(id=test_run_id).status == StatusChoices.CANCELED:
print("Test run canceled")
return

test_result = TestResult()
test_result.test_run = test_run
test_result.test_question = test_question
test_result.question = test_question.question
test_result.human_answer = test_question.human_answer

try:
english_text = test_question.question
translated_text = test_question.question

if test_question.language != "en":
english_text = translate_text("en-IN", english_text)
translated_text = translate_text(
test_question.language + "-IN", english_text
)
for model in test_models:

if test_run.project.assistant_id:
ai_response = converse_thread(
thread=chat,
english_text=english_text,
openai_key=settings.OPENAI_API_KEY,
)
reference_documents = []
else:
response = next(
converse(
chat = Chat()
chat.title = "Test Run: " + test_run.created_at.strftime(
"%Y-%m-%d %H:%M:%S"
)
chat.project = test_run.project
chat.model = model
chat.save()

for test_question in test_questions:
sleep(
30
) # Wait for 30 seconds to give previous test question time to complete

if TestRun.objects.get(id=test_run_id).status == StatusChoices.CANCELED:
print("Test run canceled")
return

test_result = TestResult()
test_result.test_run = test_run
test_result.test_question = test_question
test_result.question = test_question.question
test_result.human_answer = test_question.human_answer
test_result.model = model

try:
english_text = test_question.question
translated_text = test_question.question

if test_question.language != "en":
english_text = translate_text("en-IN", english_text)
translated_text = translate_text(
test_question.language + "-IN", english_text
)

if test_run.project.assistant_id:
ai_response = converse_thread(
thread=chat,
english_text=english_text,
local_translated_text=translated_text,
openai_key=settings.OPENAI_API_KEY,
chat=chat,
match_number=topk,
stats=dict(),
temperature=temperature,
user_language=test_question.language + "-IN",
stream=False,
generate_audio=False,
fetch_references=test_run.references,
documents=test_question.documents.all(),
)
reference_documents = []
else:
response = next(
converse(
english_text=english_text,
local_translated_text=translated_text,
openai_key=settings.OPENAI_API_KEY,
chat=chat,
match_number=topk,
stats=dict(),
temperature=temperature,
user_language=test_question.language + "-IN",
stream=False,
generate_audio=False,
fetch_references=test_run.references,
documents=test_question.documents.all(),
)
)
ai_response = response.message
reference_documents = response.reference_documents

# Calculate cosine similarity
openai.api_key = settings.OPENAI_API_KEY
ai_response_embedding = get_embedding(ai_response)
human_answer_embedding = get_embedding(test_question.human_answer)
cosine_sim = cosine_similarity(
ai_response_embedding, human_answer_embedding
)
ai_response = response.message
reference_documents = response.reference_documents

# Calculate cosine similarity
openai.api_key = settings.OPENAI_API_KEY
ai_response_embedding = get_embedding(ai_response)
human_answer_embedding = get_embedding(test_question.human_answer)
cosine_sim = cosine_similarity(
ai_response_embedding, human_answer_embedding
)

# Calculate BLEU score ( https://www.nltk.org/api/nltk.translate.bleu_score.html#nltk.translate.bleu_score.SmoothingFunction.__init__ )
reference_tokens = test_question.human_answer.split()
candidate_tokens = ai_response.split()

smoothie = SmoothingFunction().method4
bleu_score = sentence_bleu(
[reference_tokens], candidate_tokens, smoothing_function=smoothie
)

test_result.answer = ai_response
test_result.cosine_sim = cosine_sim
test_result.bleu_score = round(bleu_score, 4)
try:
test_result.references.set(reference_documents.all())
except Exception:
pass

except Exception as e:
print("Error while running test question: ", e)
test_result.answer = ""
test_result.cosine_sim = 0
test_result.bleu_score = 0

finally:
test_result.save()

# Calculate BLEU score ( https://www.nltk.org/api/nltk.translate.bleu_score.html#nltk.translate.bleu_score.SmoothingFunction.__init__ )
reference_tokens = test_question.human_answer.split()
candidate_tokens = ai_response.split()

smoothie = SmoothingFunction().method4
bleu_score = sentence_bleu(
[reference_tokens],
candidate_tokens,
smoothing_function=smoothie,
)

test_result.answer = ai_response
test_result.cosine_sim = cosine_sim
test_result.bleu_score = round(bleu_score, 4)
try:
test_result.references.set(reference_documents.all())
except Exception:
pass

except Exception as e:
print("Error while running test question: ", e)
test_result.answer = ""
test_result.cosine_sim = 0
test_result.bleu_score = 0

finally:
test_result.save()

test_run.status = StatusChoices.COMPLETED
test_run.save()
Expand Down

0 comments on commit 0fb18ce

Please sign in to comment.