Skip to content

Commit

Permalink
New safety scenario: HarmBench GCG-T (#3035)
Browse files Browse the repository at this point in the history
Co-authored-by: Yifan Mai <[email protected]>
  • Loading branch information
farzaank and yifanmai authored Dec 21, 2024
1 parent 1224774 commit b1fcafd
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 0 deletions.
37 changes: 37 additions & 0 deletions src/helm/benchmark/run_specs/safety_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,43 @@ def get_harm_bench_spec() -> RunSpec:
)


@run_spec_function("harm_bench_gcg_transfer")
def get_harm_bench_gcg_transfer_spec() -> RunSpec:
    """Assemble the run spec for the ``harm_bench_gcg_transfer`` safety scenario.

    The adapter uses the generation method with every prefix, suffix and
    instruction string left empty, no in-context training instances, a single
    output of at most 512 tokens, temperature 0.0 and no stop sequences.
    Completions are annotated by ``HarmBenchAnnotator`` and scored with the
    safety score, safety basic-generation and instances-per-split metrics.

    Returns:
        The ``RunSpec`` registered under the name ``harm_bench_gcg_transfer``.
    """
    scenario = ScenarioSpec(
        class_name="helm.benchmark.scenarios.harm_bench_gcg_transfer_scenario.HarmBenchGCGTransferScenario"
    )

    # Prompts are passed through untouched: no wrapping text is added anywhere.
    adapter = AdapterSpec(
        method=ADAPT_GENERATION,
        global_prefix="",
        global_suffix="",
        instructions="",
        input_prefix="",
        input_suffix="",
        output_prefix="",
        output_suffix="",
        instance_prefix="",
        max_train_instances=0,
        num_outputs=1,
        max_tokens=512,
        temperature=0.0,
        stop_sequences=[],
    )

    annotators = [AnnotatorSpec(class_name="helm.benchmark.annotation.harm_bench_annotator.HarmBenchAnnotator")]

    metric_class_names = (
        "helm.benchmark.metrics.safety_metrics.SafetyScoreMetric",
        "helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric",
        "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
    )
    metrics = [MetricSpec(class_name=class_name) for class_name in metric_class_names]

    return RunSpec(
        name="harm_bench_gcg_transfer",
        scenario_spec=scenario,
        adapter_spec=adapter,
        metric_specs=metrics,
        annotators=annotators,
        groups=["harm_bench_gcg_transfer"],
    )


@run_spec_function("simple_safety_tests")
def get_simple_safety_tests_spec() -> RunSpec:
adapter_spec = AdapterSpec(
Expand Down
50 changes: 50 additions & 0 deletions src/helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import pandas as pd
from typing import List

from helm.common.general import ensure_file_downloaded

from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output


class HarmBenchGCGTransferScenario(Scenario):
    """
    HarmBench GCG-T scenario.

    HarmBench is a standardized evaluation framework for automated red teaming.
    HarmBench identifies key considerations previously unaccounted for in red teaming
    evaluations and systematically designed prompts that meet these criteria.
    https://arxiv.org/abs/2402.04249

    This scenario downloads a CSV file and turns each row into one test instance,
    using the ``behavior_prompt`` column as the model input and the
    ``behavior_name`` column as the instance id.
    """

    name = "harm_bench_gcg_transfer"
    description = "HarmBench GCG-T dataset - standard and contextual attacks"
    tags = ["safety"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the GCG-T CSV (if needed) and build one test instance per row.

        Args:
            output_path: Directory for this scenario's files; the CSV is stored
                at ``<output_path>/data/harmbench_gcg.csv``.

        Returns:
            One ``Instance`` per CSV row, all in the test split, each carrying a
            single untagged reference with the text ``"unsafe behavior"``.
        """
        data_path = os.path.join(output_path, "data")
        os.makedirs(data_path, exist_ok=True)
        # NOTE(review): this URL follows the `main` branch rather than a pinned commit,
        # so the downloaded data may change over time - consider pinning a revision.
        url = "https://raw.githubusercontent.com/farzaank/harmbench-gcg-ensembled/refs/heads/main/output.csv"  # noqa: E501
        outf_path = os.path.join(data_path, "harmbench_gcg.csv")
        ensure_file_downloaded(
            source_url=url,
            target_path=outf_path,
            unpack=False,
        )

        df = pd.read_csv(outf_path)

        # Read all the instances
        instances: List[Instance] = []
        # we take the direct_request template from HarmBench for simplicity
        # see the example here
        # https://github.com/centerforaisafety/HarmBench/blob/main/baselines/direct_request/direct_request.py
        #
        # in the future we can use other types of templates including various red team attacks
        for _, row in df.iterrows():
            # A fresh reference list is built per row so instances never share a mutable list.
            references = [Reference(output=Output(text="unsafe behavior"), tags=[])]
            # Local names avoid shadowing the `input` and `id` builtins.
            prompt = Input(text=row["behavior_prompt"])
            instance_id = row["behavior_name"]
            instances.append(
                Instance(input=prompt, split=TEST_SPLIT, references=references, id=instance_id)
            )
        return instances
18 changes: 18 additions & 0 deletions src/helm/benchmark/static/schema_safety.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ run_groups:
description: Scenarios for the model safety
category: All scenarios
subgroups:
# - harm_bench_gcg_transfer
- harm_bench
- simple_safety_tests
- bbq
Expand All @@ -194,6 +195,23 @@ run_groups:
when: n/a
language: English

- name: harm_bench_gcg_transfer
display_name: HarmBenchGCGTransfer
description: HarmBenchGCGTransfer
metric_groups:
- accuracy
- general_information
- annotation_metrics
environment:
main_name: safety_score
main_split: test
taxonomy:
task: question answering
what: n/a
who: n/a
when: n/a
language: English

- name: simple_safety_tests
display_name: SimpleSafetyTests
description: SimpleSafetyTests
Expand Down

0 comments on commit b1fcafd

Please sign in to comment.