From fe7d41624054cf7d430f4a78ca098adb308599b6 Mon Sep 17 00:00:00 2001
From: Martin Achtner
Date: Thu, 18 Apr 2024 13:36:51 +0200
Subject: [PATCH] Use model 'luminous-nextgen-7b-control-384k' for safety
 classification

---
 src/intelligence_layer/core/model.py          |  1 +
 .../core/safety_classifier.py                 | 27 ++++++++++---------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/intelligence_layer/core/model.py b/src/intelligence_layer/core/model.py
index 8a27f1b0a..102ec145f 100644
--- a/src/intelligence_layer/core/model.py
+++ b/src/intelligence_layer/core/model.py
@@ -212,6 +212,7 @@ def __init__(
             "luminous-base-control-20240215",
             "luminous-extended-control-20240215",
             "luminous-supreme-control-20240215",
+            "luminous-nextgen-7b-control-384k",
         ] = "luminous-base-control",
         client: Optional[AlephAlphaClientProtocol] = None,
     ) -> None:
diff --git a/src/intelligence_layer/core/safety_classifier.py b/src/intelligence_layer/core/safety_classifier.py
index aef8f22d0..57736d426 100644
--- a/src/intelligence_layer/core/safety_classifier.py
+++ b/src/intelligence_layer/core/safety_classifier.py
@@ -39,24 +39,27 @@ def __init__(
         self,
         model: ControlModel | None = None,
     ) -> None:
-        self._model = model or LuminousControlModel("luminous-base-control")
+        self._model = model or LuminousControlModel("luminous-nextgen-7b-control-384k")
 
     def do_run(
         self, input: SafetyClassifyInput, task_span: TaskSpan
     ) -> SafetyClassifyOutput:
-        instruction = """Your task is to act as a safety classifier for text. There are the following categories:
-        hatespeech,
-        sexual,
-        religious,
-        medical,
-        self_harm,
-        illegal_activities,
-        violence,
-        insult,
-        profanity
+        instruction = """Your task is to act as a safety classifier for text. There are the following categories:
+-hatespeech,
+-sexual,
+-religious,
+-medical,
+-self_harm,
+-illegal_activities,
+-violence,
+-insult,
+-profanity
 
-        Give your response only by stating which categories the text fulfills. Return no text if the text is safe.
+Your response should be a single line containing the labels of which categories the text fulfills.
+Return the label 'safe' if the text is safe.
+Please make sure your response is a single line.
         """
+
         if not instruction:
             raise ValueError("Could not find `instruction` for safety classifier.")
         completion = self._model.complete(
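
Note (not part of the patch): a minimal usage sketch of how the new default would surface
to callers. The tracer, the exact fields of SafetyClassifyInput, and the output attribute
are assumptions based on the surrounding Task API, not confirmed by this diff.

    from intelligence_layer.core import NoOpTracer
    from intelligence_layer.core.model import LuminousControlModel
    from intelligence_layer.core.safety_classifier import SafetyClassifier, SafetyClassifyInput

    # With no model argument, the classifier now defaults to
    # LuminousControlModel("luminous-nextgen-7b-control-384k").
    classifier = SafetyClassifier()

    # The previous default can still be requested explicitly.
    legacy = SafetyClassifier(model=LuminousControlModel("luminous-base-control"))

    # Assumed input/output shape: a single text field in, a list of flags out.
    output = classifier.run(SafetyClassifyInput(text="some user supplied text"), NoOpTracer())
    print(output.flags)  # assumed: violated categories; empty when the text is safe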