New safety scenario: HarmBench GCG-T (#3035)

Co-authored-by: Yifan Mai <[email protected]>
stanford-crfm · Dec 21, 2024 · b1fcafd · b1fcafd
1 parent 1224774
commit b1fcafd
Show file tree

Hide file tree

Showing 3 changed files with 105 additions and 0 deletions.
diff --git a/src/helm/benchmark/run_specs/safety_run_specs.py b/src/helm/benchmark/run_specs/safety_run_specs.py
@@ -41,6 +41,43 @@ def get_harm_bench_spec() -> RunSpec:
     )
 
 
+@run_spec_function("harm_bench_gcg_transfer")
+def get_harm_bench_gcg_transfer_spec() -> RunSpec:
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        global_prefix="",
+        global_suffix="",
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        instance_prefix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=512,
+        temperature=0.0,
+        stop_sequences=[],
+    )
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.harm_bench_gcg_transfer_scenario.HarmBenchGCGTransferScenario"
+    )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.harm_bench_annotator.HarmBenchAnnotator")]
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
+        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
+    ]
+    return RunSpec(
+        name="harm_bench_gcg_transfer",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        annotators=annotator_specs,
+        groups=["harm_bench_gcg_transfer"],
+    )
+
+
 @run_spec_function("simple_safety_tests")
 def get_simple_safety_tests_spec() -> RunSpec:
     adapter_spec = AdapterSpec(

diff --git a/src/helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py b/src/helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py
@@ -0,0 +1,50 @@
+import os
+import pandas as pd
+from typing import List
+
+from helm.common.general import ensure_file_downloaded
+
+from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output
+
+
+class HarmBenchGCGTransferScenario(Scenario):
+    """
+    HarmBenchGCG-T is a standardized evaluation framework for automated red teaming.
+    HarmBench identifies key considerations previously unaccounted for in red teaming
+    evaluations and systematically designed prompts that meet these criteria.
+
+    https://arxiv.org/abs/2402.04249
+    """
+
+    name = "harm_bench_gcg_transfer"
+    description = "HarmBench GCG-T dataset - standard and contextual attacks"
+    tags = ["safety"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = os.path.join(output_path, "data")
+        os.makedirs(data_path, exist_ok=True)
+        url = "https://raw.githubusercontent.com/farzaank/harmbench-gcg-ensembled/refs/heads/main/output.csv"  # noqa: E501
+        outf_path = os.path.join(data_path, "harmbench_gcg.csv")
+        ensure_file_downloaded(
+            source_url=url,
+            target_path=outf_path,
+            unpack=False,
+        )
+
+        df = pd.read_csv(outf_path)
+
+        # Read all the instances
+        instances: List[Instance] = []
+        # we take the direct_request template from HarmBench for simplicity
+        # see the example here
+        # https://github.com/centerforaisafety/HarmBench/blob/main/baselines/direct_request/direct_request.py
+        #
+        # in the future we can use other types of templates including various red team attacks
+        for i, row in df.iterrows():
+            references = [Reference(output=Output(text="unsafe behavior"), tags=[])]
+            input_text = row["behavior_prompt"]
+            input = Input(text=input_text)
+            id = row["behavior_name"]
+            instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
+            instances.append(instance)
+        return instances
diff --git a/src/helm/benchmark/static/schema_safety.yaml b/src/helm/benchmark/static/schema_safety.yaml
@@ -171,6 +171,7 @@ run_groups:
     description: Scenarios for the model safety
     category: All scenarios
     subgroups:
+      # - harm_bench_gcg_transfer
       - harm_bench
       - simple_safety_tests
       - bbq
@@ -194,6 +195,23 @@ run_groups:
       when: n/a
       language: English
 
+  # Run group for the HarmBench GCG-T scenario; the name matches the `groups`
+  # entry of the `harm_bench_gcg_transfer` run spec.
+  # NOTE(review): this group is commented out in the `subgroups` list above -
+  # confirm whether it should be listed there.
+  # NOTE(review): display_name/description just repeat the group name, and the
+  # taxonomy values look carried over from a sibling entry - confirm.
+  - name: harm_bench_gcg_transfer
+    display_name: HarmBenchGCGTransfer
+    description: HarmBenchGCGTransfer
+    metric_groups:
+      - accuracy
+      - general_information
+      - annotation_metrics
+    environment:
+      main_name: safety_score
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
   - name: simple_safety_tests
     display_name: SimpleSafetyTests
     description: SimpleSafetyTests