From 9839a8c365c2f3f2997d108dc6fa4a5e2c8c4952 Mon Sep 17 00:00:00 2001 From: Thallyson Alves Date: Fri, 20 Dec 2024 17:07:18 -0300 Subject: [PATCH] Adding TweetSentBR Scenario (#3219) --- .../run_specs/tweetsentbr_run_specs.py | 32 ++++ .../scenarios/test_tweetsentbr_scenario.py | 24 +++ .../scenarios/tweetsentbr_scenario.py | 66 ++++++++ .../benchmark/static/schema_tweetsentbr.yaml | 146 ++++++++++++++++++ 4 files changed, 268 insertions(+) create mode 100644 src/helm/benchmark/run_specs/tweetsentbr_run_specs.py create mode 100644 src/helm/benchmark/scenarios/test_tweetsentbr_scenario.py create mode 100644 src/helm/benchmark/scenarios/tweetsentbr_scenario.py create mode 100644 src/helm/benchmark/static/schema_tweetsentbr.yaml diff --git a/src/helm/benchmark/run_specs/tweetsentbr_run_specs.py b/src/helm/benchmark/run_specs/tweetsentbr_run_specs.py new file mode 100644 index 00000000000..535cc7903d4 --- /dev/null +++ b/src/helm/benchmark/run_specs/tweetsentbr_run_specs.py @@ -0,0 +1,32 @@ +from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec +from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs +from helm.benchmark.run_spec import RunSpec, run_spec_function +from helm.benchmark.scenarios.scenario import ScenarioSpec + + +@run_spec_function("tweetsentbr") +def get_tweetsentbr_spec() -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.tweetsentbr_scenario.TweetSentBRScenario", args={} + ) + + adapter_spec = get_generation_adapter_spec( + instructions="""Classifique o tweet como "Positivo", "Neutro" ou "Negativo". + + Tweet: vocรชs viram a novela hoje? + Classe: Neutro + + Tweet: que vontade de comer pizza + Classe: Neutro + """, + input_noun="Tweet", + output_noun="Classe", + ) + + return RunSpec( + name="tweetsentbr", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(), + groups=["tweetsentbr"], + ) diff --git a/src/helm/benchmark/scenarios/test_tweetsentbr_scenario.py b/src/helm/benchmark/scenarios/test_tweetsentbr_scenario.py new file mode 100644 index 00000000000..a7485c61784 --- /dev/null +++ b/src/helm/benchmark/scenarios/test_tweetsentbr_scenario.py @@ -0,0 +1,24 @@ +import pytest +from tempfile import TemporaryDirectory + +from helm.benchmark.scenarios.tweetsentbr_scenario import TweetSentBRScenario +from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference + + +@pytest.mark.scenarios +def test_tweetsentbr_scenario(): + tweetsentbr = TweetSentBRScenario() + with TemporaryDirectory() as tmpdir: + instances = tweetsentbr.get_instances(tmpdir) + assert len(instances) == 2085 + assert instances[0].split == TRAIN_SPLIT + + assert instances[0].input.text.startswith("joca tรก com a corda toda ๐Ÿ˜‚ ๐Ÿ˜‚ ๐Ÿ˜‚ ๐Ÿ˜‚") + assert len(instances[0].input.text) == 32 + + assert instances[0].references == [ + Reference( + output=Output(text="Positivo"), + tags=[CORRECT_TAG], + ) + ] diff --git a/src/helm/benchmark/scenarios/tweetsentbr_scenario.py b/src/helm/benchmark/scenarios/tweetsentbr_scenario.py new file mode 100644 index 00000000000..10e46a117a0 --- /dev/null +++ b/src/helm/benchmark/scenarios/tweetsentbr_scenario.py @@ -0,0 +1,66 @@ +from typing import Any, List, Dict +from pathlib import Path +from datasets import load_dataset +from helm.common.hierarchical_logger import hlog +from helm.benchmark.scenarios.scenario import ( + Scenario, + Instance, 
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class TweetSentBRScenario(Scenario):
+    """
+    TweetSentBR is a corpus of tweets in Brazilian Portuguese. It was labeled by several
+    annotators following steps established in the literature to improve reliability in
+    the task of Sentiment Analysis. Each tweet was annotated with one of the following three classes:
+
+    Positive - tweets where the user expressed a positive reaction to or evaluation of the main topic of the post;
+    Negative - tweets where the user expressed a negative reaction to or evaluation of the main topic of the post;
+    Neutral - tweets not belonging to either of the previous classes, usually not making a point, off topic,
+    irrelevant, confusing, or containing only objective data.
+
+    This dataset is a subset of TweetSentBR; it contains only 75 samples from the training set
+    and all 2,000+ instances of the test set. It is meant for evaluating language models in a few-shot setting.
+    """
+
+    name = "tweetsentbr"
+    description = "Classify tweets into Positive, Negative or Neutral."
+    tags = ["classification"]
+
+    def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        label_names = {"Positive": "Positivo", "Negative": "Negativo", "Neutral": "Neutro"}
+        for example in dataset[split]:
+            input = Input(text=example["sentence"])
+            # NOTE: For classification scenarios, the reference outputs should be the same
+            # for all instances, and should include both correct and incorrect classes.
+            # HELM only supports single-label classification. Exactly one reference
+            # should have the CORRECT_TAG tag.
+            references = [
+                Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
+            ]
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+        dataset = load_dataset("eduagarcia/tweetsentbr_fewshot", cache_dir=cache_dir)
+        splits: Dict[str, str] = {
+            "train": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            if split not in dataset:
+                hlog(f"{split} split doesn't exist, skipping")
+                continue
+            instances.extend(self.process_dataset(dataset, splits[split]))
+
+        return instances
diff --git a/src/helm/benchmark/static/schema_tweetsentbr.yaml b/src/helm/benchmark/static/schema_tweetsentbr.yaml
new file mode 100644
index 00000000000..eb721ffd48c
--- /dev/null
+++ b/src/helm/benchmark/static/schema_tweetsentbr.yaml
@@ -0,0 +1,146 @@
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: core_scenarios
+    display_name: Core Scenarios
+    description: Core scenarios evaluated in this benchmark.
+    category: All scenarios
+    subgroups:
+      - tweetsentbr
+
+  - name: tweetsentbr
+    display_name: TweetSentBR
+    description: Sentiment classification of Brazilian Portuguese tweets into Positivo, Neutro and Negativo (TweetSentBR).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
    taxonomy:
+      task: "text classification"
+      what: "tweets with sentiments"
+      who: "?"
+      when: "2018"
+      language: Portuguese
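
Usage note: a minimal sketch of how the new run spec, schema, and test could be exercised locally. The model deployment is a placeholder, and the exact CLI flags (--run-entries, --suite, --max-eval-instances, --schema-path) are assumptions based on recent HELM releases; adjust to the installed version.

    # Smoke-test the new run spec against a placeholder model deployment.
    helm-run --run-entries "tweetsentbr:model=<model-deployment>" --suite tweetsentbr-dev --max-eval-instances 10

    # Summarize results with the schema added in this patch.
    helm-summarize --suite tweetsentbr-dev --schema-path src/helm/benchmark/static/schema_tweetsentbr.yaml

    # Run the scenario unit test added in this patch (it carries the "scenarios" pytest marker).
    pytest -m scenarios src/helm/benchmark/scenarios/test_tweetsentbr_scenario.py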