From 3706c498e7b1f8ec457cc8ffa216838944ff93a7 Mon Sep 17 00:00:00 2001 From: Sebastian Wolfschmidt Date: Thu, 18 Apr 2024 12:35:27 +0200 Subject: [PATCH 1/7] feat: add basic logic and first tests for safety classifier --- .../core/safety_classifier.py | 75 +++++++++++++++++++ tests/core/test_safety_classifier.py | 34 +++++++++ 2 files changed, 109 insertions(+) create mode 100644 src/intelligence_layer/core/safety_classifier.py create mode 100644 tests/core/test_safety_classifier.py diff --git a/src/intelligence_layer/core/safety_classifier.py b/src/intelligence_layer/core/safety_classifier.py new file mode 100644 index 000000000..aef8f22d0 --- /dev/null +++ b/src/intelligence_layer/core/safety_classifier.py @@ -0,0 +1,75 @@ +from enum import Enum +from typing import Sequence + +from pydantic import BaseModel + +from intelligence_layer.core import ( + CompleteInput, + ControlModel, + LuminousControlModel, + Task, + TaskSpan, + TextChunk, +) + + +class UnsafeOutputFlag(Enum): + HATESPEECH = "hatespeech" + SEXUAL = "sexual" + RELIGIOUS = "religious" + MEDICAL = "medical" + SELF_HARM = "self_harm" + ILLEGAL_ACTIVITIES = "illegal_activities" + VIOLENCE = "violence" + INSULT = "insult" + PROFANITY = "profanity" + + +class SafetyClassifyInput(BaseModel): + text: TextChunk + + +class SafetyClassifyOutput(BaseModel): + flags: Sequence[UnsafeOutputFlag] # "safe" is an empty list. + # Flag is only included if the task determines it should be flagged + + +class SafetyClassifier(Task[SafetyClassifyInput, SafetyClassifyOutput]): + def __init__( + self, + model: ControlModel | None = None, + ) -> None: + self._model = model or LuminousControlModel("luminous-base-control") + + def do_run( + self, input: SafetyClassifyInput, task_span: TaskSpan + ) -> SafetyClassifyOutput: + instruction = """Your task is to act as a safety classifier for text. There are the following categories: + hatespeech, + sexual, + religious, + medical, + self_harm, + illegal_activities, + violence, + insult, + profanity + + Give your response only by stating which categories the text fulfills. Return no text if the text is safe. 
+""" + if not instruction: + raise ValueError("Could not find `instruction` for safety classifier.") + completion = self._model.complete( + CompleteInput( + prompt=self._model.to_instruct_prompt(instruction, input.text), + ), + task_span, + ) + completion_str = str(completion.completions[0].completion) + potential_flags = completion_str.split(", ") + flags = [ + UnsafeOutputFlag(flag.strip()) + for flag in potential_flags + if hasattr(UnsafeOutputFlag, flag.strip().upper()) + ] + return SafetyClassifyOutput(flags=flags) diff --git a/tests/core/test_safety_classifier.py b/tests/core/test_safety_classifier.py new file mode 100644 index 000000000..9aa5c2018 --- /dev/null +++ b/tests/core/test_safety_classifier.py @@ -0,0 +1,34 @@ +from pytest import fixture + +from intelligence_layer.core import LuminousControlModel, NoOpTracer, TextChunk +from intelligence_layer.core.safety_classifier import ( + SafetyClassifier, + SafetyClassifyInput, +) + + +@fixture +def safety_classifier( + luminous_control_model: LuminousControlModel, +) -> SafetyClassifier: + return SafetyClassifier(model=luminous_control_model) + + +def test_safety_classifier_returns_no_flags_for_safe_input( + safety_classifier: SafetyClassifier, +) -> None: + text = "This is a safe text" + input = SafetyClassifyInput(text=TextChunk(text)) + + output = safety_classifier.run(input, NoOpTracer()) + assert len(output.flags) == 0 + + +def test_safety_classifier_returns_flags_for_unsafe_input( + safety_classifier: SafetyClassifier, +) -> None: + text = "I will kill you." + input = SafetyClassifyInput(text=TextChunk(text)) + + output = safety_classifier.run(input, NoOpTracer()) + assert len(output.flags) > 0 From 1b1914d59f6114b531a1bfc46b0f4939d54733fb Mon Sep 17 00:00:00 2001 From: Martin Achtner Date: Thu, 18 Apr 2024 13:36:51 +0200 Subject: [PATCH 2/7] feat: Use model 'luminous-nextgen-7b-control-384k' for safety classification --- src/intelligence_layer/core/model.py | 1 + .../core/safety_classifier.py | 27 ++++++++++--------- tests/core/test_safety_classifier.py | 5 ++-- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/intelligence_layer/core/model.py b/src/intelligence_layer/core/model.py index ce8118c4d..a8b3b96ec 100644 --- a/src/intelligence_layer/core/model.py +++ b/src/intelligence_layer/core/model.py @@ -214,6 +214,7 @@ def __init__( "luminous-base-control-20240215", "luminous-extended-control-20240215", "luminous-supreme-control-20240215", + "luminous-nextgen-7b-control-384k", ] = "luminous-base-control", client: Optional[AlephAlphaClientProtocol] = None, ) -> None: diff --git a/src/intelligence_layer/core/safety_classifier.py b/src/intelligence_layer/core/safety_classifier.py index aef8f22d0..57736d426 100644 --- a/src/intelligence_layer/core/safety_classifier.py +++ b/src/intelligence_layer/core/safety_classifier.py @@ -39,24 +39,27 @@ def __init__( self, model: ControlModel | None = None, ) -> None: - self._model = model or LuminousControlModel("luminous-base-control") + self._model = model or LuminousControlModel("luminous-nextgen-7b-control-384k") def do_run( self, input: SafetyClassifyInput, task_span: TaskSpan ) -> SafetyClassifyOutput: - instruction = """Your task is to act as a safety classifier for text. There are the following categories: - hatespeech, - sexual, - religious, - medical, - self_harm, - illegal_activities, - violence, - insult, - profanity + instruction = """Your task is to act as a safety classifier for text. 
There are the following categories: +-hatespeech, +-sexual, +-religious, +-medical, +-self_harm, +-illegal_activities, +-violence, +-insult, +-profanity - Give your response only by stating which categories the text fulfills. Return no text if the text is safe. +Your response should be a single line containing the labels of which categories the text fulfills. +Return the label 'safe' if the text is safe. +Please make sure your response is a single line. """ + if not instruction: raise ValueError("Could not find `instruction` for safety classifier.") completion = self._model.complete( diff --git a/tests/core/test_safety_classifier.py b/tests/core/test_safety_classifier.py index 9aa5c2018..fc1bc6ca4 100644 --- a/tests/core/test_safety_classifier.py +++ b/tests/core/test_safety_classifier.py @@ -4,6 +4,7 @@ from intelligence_layer.core.safety_classifier import ( SafetyClassifier, SafetyClassifyInput, + UnsafeOutputFlag, ) @@ -11,7 +12,7 @@ def safety_classifier( luminous_control_model: LuminousControlModel, ) -> SafetyClassifier: - return SafetyClassifier(model=luminous_control_model) + return SafetyClassifier(model=None) def test_safety_classifier_returns_no_flags_for_safe_input( @@ -31,4 +32,4 @@ def test_safety_classifier_returns_flags_for_unsafe_input( input = SafetyClassifyInput(text=TextChunk(text)) output = safety_classifier.run(input, NoOpTracer()) - assert len(output.flags) > 0 + assert output.flags == [UnsafeOutputFlag.VIOLENCE] From facbeba2af16c413104849a788e6e859e01f1214 Mon Sep 17 00:00:00 2001 From: Martin Achtner Date: Thu, 18 Apr 2024 14:03:48 +0200 Subject: [PATCH 3/7] feat: Add some test cases for safety classification --- tests/core/test_safety_classifier.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/core/test_safety_classifier.py b/tests/core/test_safety_classifier.py index fc1bc6ca4..55a281ade 100644 --- a/tests/core/test_safety_classifier.py +++ b/tests/core/test_safety_classifier.py @@ -1,3 +1,6 @@ +from typing import List + +import pytest from pytest import fixture from intelligence_layer.core import LuminousControlModel, NoOpTracer, TextChunk @@ -25,11 +28,24 @@ def test_safety_classifier_returns_no_flags_for_safe_input( assert len(output.flags) == 0 +@pytest.mark.parametrize( + "text, expected_labels", + [ + ("I will kill you.", [UnsafeOutputFlag.VIOLENCE]), + ("I want to kill myself.", [UnsafeOutputFlag.SELF_HARM]), + ("You fucking asshole.", [UnsafeOutputFlag.INSULT]), + ( + "The holy lord is going to deliver us from evil.", + [UnsafeOutputFlag.RELIGIOUS], + ), + ], +) def test_safety_classifier_returns_flags_for_unsafe_input( safety_classifier: SafetyClassifier, + text: str, + expected_labels: List[UnsafeOutputFlag], ) -> None: - text = "I will kill you." 
input = SafetyClassifyInput(text=TextChunk(text)) output = safety_classifier.run(input, NoOpTracer()) - assert output.flags == [UnsafeOutputFlag.VIOLENCE] + assert output.flags == expected_labels From fba585ca23173d2af82bf6d8ef3acc5ded1898d7 Mon Sep 17 00:00:00 2001 From: Martin Achtner Date: Thu, 18 Apr 2024 14:25:05 +0200 Subject: [PATCH 4/7] feat: Add german prompt for safety classifier --- .pre-commit-config.yaml | 2 +- .../core/safety_classifier.py | 61 +++++++++++++------ tests/core/test_safety_classifier.py | 32 +++++++++- 3 files changed, 75 insertions(+), 20 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fe59b8084..a3e22d40b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,7 +51,7 @@ repos: args: [ "-L", - "newyorker,te,responde,ist,als,oder,technik,sie,rouge,unter,juli,fiel,couldn,mke, vor,fille,ans", + "newyorker,te,responde,ist,als,oder,technik,sie,rouge,unter,juli,fiel,couldn,mke,vor,fille,ans,deine,Deine", ] exclude: '^(poetry\.lock|trace-viewer/.*|tests/connectors/retrievers/test_document_index_retriever\.py|src/intelligence_layer/use_cases/qa/multiple_chunk_qa.py|src/intelligence_layer/use_cases/summarize/.*|tests/connectors/retrievers/test_document_index_retriever\.py|src/intelligence_layer/use_cases/classify/keyword_extract.py|tests/use_cases/summarize/test_single_chunk_few_shot_summarize.py|tests/use_cases/summarize/very_long_text.txt)$' - repo: https://github.com/akaihola/darglint2 diff --git a/src/intelligence_layer/core/safety_classifier.py b/src/intelligence_layer/core/safety_classifier.py index 57736d426..2a85f2f17 100644 --- a/src/intelligence_layer/core/safety_classifier.py +++ b/src/intelligence_layer/core/safety_classifier.py @@ -1,17 +1,55 @@ from enum import Enum -from typing import Sequence +from typing import Mapping, Sequence from pydantic import BaseModel from intelligence_layer.core import ( CompleteInput, ControlModel, + Language, LuminousControlModel, Task, TaskSpan, TextChunk, ) +INSTRUCTION_CONFIGS = { + Language( + "en" + ): """Your task is to act as a safety classifier for text. There are the following categories: +-hatespeech, +-sexual, +-religious, +-medical, +-self_harm, +-illegal_activities, +-violence, +-insult, +-profanity + +Your response should be a single line containing the labels of which categories the text fulfills. +Return the label 'safe' if the text is safe. +Please make sure your response is a single line. +""", + Language( + "de" + ): """Deine Aufgabe besteht darin, als Sicherheitsklassifizierer für Text zu fungieren. Es gibt folgende Kategorien: +-hatespeech, +-sexual, +-religious, +-medical, +-self_harm, +-illegal_activities, +-violence, +-insult, +-profanity + +Deine Antwort solle eine Zeile mit Labels der jeweiligen Kategorien umfassen, die der Text erfüllt. +Gib das Label 'safe' zurück, wenn der Text sicher ist. +Bitte stelle sicher, dass deine Antwort eine Zeile lang ist. 
+""", +} + class UnsafeOutputFlag(Enum): HATESPEECH = "hatespeech" @@ -27,6 +65,7 @@ class UnsafeOutputFlag(Enum): class SafetyClassifyInput(BaseModel): text: TextChunk + language: Language = Language("en") class SafetyClassifyOutput(BaseModel): @@ -38,30 +77,18 @@ class SafetyClassifier(Task[SafetyClassifyInput, SafetyClassifyOutput]): def __init__( self, model: ControlModel | None = None, + instruction_configs: Mapping[Language, str] = INSTRUCTION_CONFIGS, ) -> None: self._model = model or LuminousControlModel("luminous-nextgen-7b-control-384k") + self._instruction_configs = instruction_configs def do_run( self, input: SafetyClassifyInput, task_span: TaskSpan ) -> SafetyClassifyOutput: - instruction = """Your task is to act as a safety classifier for text. There are the following categories: --hatespeech, --sexual, --religious, --medical, --self_harm, --illegal_activities, --violence, --insult, --profanity - -Your response should be a single line containing the labels of which categories the text fulfills. -Return the label 'safe' if the text is safe. -Please make sure your response is a single line. -""" + instruction = self._instruction_configs.get(input.language) if not instruction: - raise ValueError("Could not find `instruction` for safety classifier.") + raise ValueError(f"Could not find `prompt_config` for {input.language}.") completion = self._model.complete( CompleteInput( prompt=self._model.to_instruct_prompt(instruction, input.text), diff --git a/tests/core/test_safety_classifier.py b/tests/core/test_safety_classifier.py index 55a281ade..14acd1eb1 100644 --- a/tests/core/test_safety_classifier.py +++ b/tests/core/test_safety_classifier.py @@ -3,7 +3,12 @@ import pytest from pytest import fixture -from intelligence_layer.core import LuminousControlModel, NoOpTracer, TextChunk +from intelligence_layer.core import ( + Language, + LuminousControlModel, + NoOpTracer, + TextChunk, +) from intelligence_layer.core.safety_classifier import ( SafetyClassifier, SafetyClassifyInput, @@ -40,7 +45,7 @@ def test_safety_classifier_returns_no_flags_for_safe_input( ), ], ) -def test_safety_classifier_returns_flags_for_unsafe_input( +def test_safety_classifier_returns_flags_for_unsafe_english_input( safety_classifier: SafetyClassifier, text: str, expected_labels: List[UnsafeOutputFlag], @@ -49,3 +54,26 @@ def test_safety_classifier_returns_flags_for_unsafe_input( output = safety_classifier.run(input, NoOpTracer()) assert output.flags == expected_labels + + +@pytest.mark.parametrize( + "text, expected_labels", + [ + ("Ich töte dich.", [UnsafeOutputFlag.VIOLENCE]), + ("Ich möchte mich umbringen.", [UnsafeOutputFlag.SELF_HARM]), + ("Du verdammtes Arschloch.", [UnsafeOutputFlag.INSULT]), + ( + "Gott ist gütig und wird uns von allem Bösen erlösen.", + [UnsafeOutputFlag.RELIGIOUS], + ), + ], +) +def test_safety_classifier_returns_flags_for_unsafe_german_input( + safety_classifier: SafetyClassifier, + text: str, + expected_labels: List[UnsafeOutputFlag], +) -> None: + input = SafetyClassifyInput(text=TextChunk(text), language=Language("de")) + + output = safety_classifier.run(input, NoOpTracer()) + assert output.flags == expected_labels From 79e67392a79761e9e43e09a0c8084f2ec42df581 Mon Sep 17 00:00:00 2001 From: Martin Achtner Date: Fri, 19 Apr 2024 15:33:07 +0200 Subject: [PATCH 5/7] feat: Add safety classification tests for longer text --- tests/core/test_safety_classifier.py | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/core/test_safety_classifier.py 
b/tests/core/test_safety_classifier.py index 14acd1eb1..11d320bf5 100644 --- a/tests/core/test_safety_classifier.py +++ b/tests/core/test_safety_classifier.py @@ -23,6 +23,14 @@ def safety_classifier( return SafetyClassifier(model=None) +@fixture +def long_text() -> str: + return """Green Day is an American rock band formed in the East Bay of California in 1987 by lead vocalist and guitarist Billie Joe Armstrong, together with bassist and backing vocalist Mike Dirnt. For most of the band's career they have been a power trio[4] with drummer Tré Cool, who replaced John Kiffmeyer in 1990 before the recording of the band's second studio album, Kerplunk (1991). Before taking its current name in 1989, Green Day was called Blood Rage, then Sweet Children and they were part of the late 1980s/early 1990s Bay Area punk scene that emerged from the 924 Gilman Street club in Berkeley, California. The band's early releases were with the independent record label Lookout! Records. In 1994, their major-label debut Dookie, released through Reprise Records, became a breakout success and eventually shipped over 10 million copies in the U.S. Alongside fellow California punk bands Bad Religion, the Offspring, Rancid, NOFX, Pennywise and Social Distortion, Green Day is credited with popularizing mainstream interest in punk rock in the U.S. +Though the albums Insomniac (1995), Nimrod (1997) and Warning (2000) did not match the success of Dookie, they were still successful, with the first two reaching double platinum status, while the last achieved gold. Green Day's seventh album, a rock opera called American Idiot (2004), found popularity with a younger generation, selling six million copies in the U.S. Their next album, 21st Century Breakdown, was released in 2009 and achieved the band's best chart performance. It was followed by a trilogy of albums, ¡Uno!, ¡Dos!, and ¡Tré!, released in September, November, and December 2012, respectively. The trilogy did not perform as well as expected commercially, in comparison to their previous albums, largely due to a lack of promotion and Armstrong entering rehab. Their twelfth studio album, Revolution Radio, was released in October 2016 and became their third to debut at No. 1 on the Billboard 200. The band's thirteenth studio album, Father of All Motherfuckers, was released on February 7, 2020. The band's fourteenth studio album, titled Saviors, is planned to come out on January 19, 2024.[5][6] +In 2010, a stage adaptation of American Idiot debuted on Broadway. The musical was nominated for three Tony Awards: Best Musical, Best Scenic Design and Best Lighting Design, winning the latter two. The band was inducted into the Rock and Roll Hall of Fame in 2015, their first year of eligibility. Members of the band have collaborated on the side projects Pinhead Gunpowder, The Network, Foxboro Hot Tubs, The Longshot and The Coverups. They have also worked on solo careers. +Green Day has sold roughly 75 million records worldwide,[7][unreliable source?] making them one of the world's best-selling artists. 
The group has been nominated for 20 Grammy awards and has won five of them with Best Alternative Album for Dookie, Best Rock Album for American Idiot and 21st Century Breakdown, Record of the Year for "Boulevard of Broken Dreams" and Best Musical Show Album for American Idiot: The Original Broadway Cast Recording.""" + + def test_safety_classifier_returns_no_flags_for_safe_input( safety_classifier: SafetyClassifier, ) -> None: @@ -77,3 +85,24 @@ def test_safety_classifier_returns_flags_for_unsafe_german_input( output = safety_classifier.run(input, NoOpTracer()) assert output.flags == expected_labels + + +def test_safety_classifier_returns_flags_for_safe_long_input( + safety_classifier: SafetyClassifier, long_text: str +) -> None: + input = SafetyClassifyInput(text=TextChunk(long_text), language=Language("en")) + + output = safety_classifier.run(input, NoOpTracer()) + assert len(output.flags) == 0 + + +def test_safety_classifier_returns_flags_for_unsafe_long_input( + safety_classifier: SafetyClassifier, long_text: str +) -> None: + unsafe_text = "\nThe band is know for their history of open drug abuse and has made some headline about violence towards minors." + input = SafetyClassifyInput( + text=TextChunk(long_text + unsafe_text), language=Language("en") + ) + + output = safety_classifier.run(input, NoOpTracer()) + assert len(output.flags) == 1 From f1600084d17a32a48e42468125462de290e23d2a Mon Sep 17 00:00:00 2001 From: Martin Achtner Date: Fri, 19 Apr 2024 16:03:10 +0200 Subject: [PATCH 6/7] feat: Automatically create list of flags in prompt based on enum --- .../core/safety_classifier.py | 51 +++++++------------ 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/src/intelligence_layer/core/safety_classifier.py b/src/intelligence_layer/core/safety_classifier.py index 2a85f2f17..dfd5543d7 100644 --- a/src/intelligence_layer/core/safety_classifier.py +++ b/src/intelligence_layer/core/safety_classifier.py @@ -13,19 +13,26 @@ TextChunk, ) + +class UnsafeOutputFlag(Enum): + HATESPEECH = "hatespeech" + SEXUAL = "sexual" + RELIGIOUS = "religious" + MEDICAL = "medical" + SELF_HARM = "self_harm" + ILLEGAL_ACTIVITIES = "illegal_activities" + VIOLENCE = "violence" + INSULT = "insult" + PROFANITY = "profanity" + + +ALL_FLAGS = "\n-".join([e.value for e in UnsafeOutputFlag]) + INSTRUCTION_CONFIGS = { Language( "en" - ): """Your task is to act as a safety classifier for text. There are the following categories: --hatespeech, --sexual, --religious, --medical, --self_harm, --illegal_activities, --violence, --insult, --profanity + ): f"""Your task is to act as a safety classifier for text. There are the following categories: +-{ALL_FLAGS} Your response should be a single line containing the labels of which categories the text fulfills. Return the label 'safe' if the text is safe. @@ -33,16 +40,8 @@ """, Language( "de" - ): """Deine Aufgabe besteht darin, als Sicherheitsklassifizierer für Text zu fungieren. Es gibt folgende Kategorien: --hatespeech, --sexual, --religious, --medical, --self_harm, --illegal_activities, --violence, --insult, --profanity + ): f"""Deine Aufgabe besteht darin, als Sicherheitsklassifizierer für Text zu fungieren. Es gibt folgende Kategorien: +-{ALL_FLAGS} Deine Antwort solle eine Zeile mit Labels der jeweiligen Kategorien umfassen, die der Text erfüllt. Gib das Label 'safe' zurück, wenn der Text sicher ist. 
@@ -51,18 +50,6 @@ } -class UnsafeOutputFlag(Enum): - HATESPEECH = "hatespeech" - SEXUAL = "sexual" - RELIGIOUS = "religious" - MEDICAL = "medical" - SELF_HARM = "self_harm" - ILLEGAL_ACTIVITIES = "illegal_activities" - VIOLENCE = "violence" - INSULT = "insult" - PROFANITY = "profanity" - - class SafetyClassifyInput(BaseModel): text: TextChunk language: Language = Language("en") From 091e6e68cb2878475bbc9d4c90a547f9a1dde202 Mon Sep 17 00:00:00 2001 From: Martin Achtner Date: Fri, 19 Apr 2024 16:06:53 +0200 Subject: [PATCH 7/7] feat: Add SafetyClassifier to CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6c3e3e0f..a6aee5a52 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ ### New Features +- feature: New `SafetyClassifier` allows to flag safe/unsafe text ### Fixes - fix: `ChunkWithIndices` now additionally returns end_index
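
The refactor in PATCH 6/7 derives the category list in both prompts from the UnsafeOutputFlag enum instead of repeating it by hand. Below is a minimal standalone sketch of that string assembly, not taken verbatim from the patch: the enum is abbreviated to three of the nine flags, and the literal "-" in front of {ALL_FLAGS} supplies the dash for the first entry while the join supplies it for the rest.

from enum import Enum


class UnsafeOutputFlag(Enum):
    # abbreviated for illustration; the patch defines nine categories
    HATESPEECH = "hatespeech"
    SEXUAL = "sexual"
    VIOLENCE = "violence"


ALL_FLAGS = "\n-".join([e.value for e in UnsafeOutputFlag])
category_block = f"-{ALL_FLAGS}"
print(category_block)
# -hatespeech
# -sexual
# -violence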
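
Taken together, the series exposes SafetyClassifier as an ordinary Task. The following is a minimal usage sketch mirroring the tests above, not a definitive integration: it assumes the default LuminousControlModel can authenticate against the Aleph Alpha API from the environment (client setup is outside this patch series), and the flag values in the comments are simply the expectations encoded in the tests.

from intelligence_layer.core import Language, NoOpTracer, TextChunk
from intelligence_layer.core.safety_classifier import (
    SafetyClassifier,
    SafetyClassifyInput,
)

# model=None falls back to LuminousControlModel("luminous-nextgen-7b-control-384k")
classifier = SafetyClassifier(model=None)

# English input; `language` defaults to Language("en")
output = classifier.run(
    SafetyClassifyInput(text=TextChunk("I will kill you.")),
    NoOpTracer(),
)
print(output.flags)  # the tests expect [UnsafeOutputFlag.VIOLENCE]

# German input selects the "de" instruction config added in PATCH 4/7
output_de = classifier.run(
    SafetyClassifyInput(
        text=TextChunk("Du verdammtes Arschloch."), language=Language("de")
    ),
    NoOpTracer(),
)
print(output_de.flags)  # the tests expect [UnsafeOutputFlag.INSULT]

# An empty `flags` list means the text was classified as safe.

Because "safe" is represented as an empty flags list, callers can treat any non-empty result as a signal to block or review the text.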