-
Notifications
You must be signed in to change notification settings - Fork 265
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4b8dc91
commit 9839a8c
Showing
4 changed files
with
268 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec | ||
from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs | ||
from helm.benchmark.run_spec import RunSpec, run_spec_function | ||
from helm.benchmark.scenarios.scenario import ScenarioSpec | ||
|
||
|
||
@run_spec_function("tweetsentbr")
def get_tweetsentbr_spec() -> RunSpec:
    """Assemble the run spec for the TweetSentBR sentiment-classification scenario.

    The scenario is driven through the generation adapter: the prompt asks, in
    Portuguese, for one of "Positivo", "Neutro" or "Negativo", and results are
    scored with the exact-match metric specs plus the classification metric specs.

    Returns:
        The RunSpec registered under the name "tweetsentbr".
    """
    # The instructions carry two worked examples inline, both labelled "Neutro".
    # NOTE(review): the line breaks inside this literal are part of the string;
    # presumably they reach the prompt as written - keep them when editing.
    prompt_instructions = """Classifique o tweet como "Positivo", "Neutro" ou "Negativo".
Tweet: vocês viram a novela hoje?
Classe: Neutro
Tweet: que vontade de comer pizza
Classe: Neutro
"""
    generation_adapter = get_generation_adapter_spec(
        instructions=prompt_instructions,
        input_noun="Tweet",
        output_noun="Classe",
    )

    scoring_specs = get_exact_match_metric_specs() + get_classification_metric_specs()

    return RunSpec(
        name="tweetsentbr",
        scenario_spec=ScenarioSpec(
            class_name="helm.benchmark.scenarios.tweetsentbr_scenario.TweetSentBRScenario",
            args={},
        ),
        adapter_spec=generation_adapter,
        metric_specs=scoring_specs,
        groups=["tweetsentbr"],
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import pytest | ||
from tempfile import TemporaryDirectory | ||
|
||
from helm.benchmark.scenarios.tweetsentbr_scenario import TweetSentBRScenario | ||
from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference | ||
|
||
|
||
@pytest.mark.scenarios
def test_tweetsentbr_scenario():
    """Load TweetSentBR into a scratch directory and spot-check the result.

    Checks the total instance count, and that the first instance belongs to the
    train split, has the expected tweet text, and has a single correct
    reference with the "Positivo" label.
    """
    scenario = TweetSentBRScenario()
    with TemporaryDirectory() as scratch_dir:
        instances = scenario.get_instances(scratch_dir)

    assert len(instances) == 2085

    first = instances[0]
    assert first.split == TRAIN_SPLIT

    # The prefix check and the length check together pin the tweet text.
    assert first.input.text.startswith("joca tá com a corda toda 😂 😂 😂 😂")
    assert len(first.input.text) == 32

    expected_references = [
        Reference(
            output=Output(text="Positivo"),
            tags=[CORRECT_TAG],
        )
    ]
    assert first.references == expected_references
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
from typing import Any, List, Dict | ||
from pathlib import Path | ||
from datasets import load_dataset | ||
from helm.common.hierarchical_logger import hlog | ||
from helm.benchmark.scenarios.scenario import ( | ||
Scenario, | ||
Instance, | ||
Reference, | ||
TRAIN_SPLIT, | ||
TEST_SPLIT, | ||
CORRECT_TAG, | ||
Input, | ||
Output, | ||
) | ||
|
||
|
||
class TweetSentBRScenario(Scenario):
    """
    TweetSentBR is a corpus of Tweets in Brazilian Portuguese. It was labeled by several
    annotators following steps established in the literature for improving reliability on
    the task of Sentiment Analysis. Each Tweet was annotated with one of the three following classes:

    Positive - tweets where a user meant a positive reaction or evaluation about the main topic on the post;
    Negative - tweets where a user meant a negative reaction or evaluation about the main topic on the post;
    Neutral - tweets not belonging to any of the last classes, usually not making a point, out of topic,
    irrelevant, confusing or containing only objective data.

    This dataset is a subset of TweetSentBR; it contains only 75 samples from the training set
    and all 2,000+ instances of the test set. This is meant for evaluating language models in a few-shot setting.
    """

    # NOTE(review): "simple_classification" looks like a leftover from a template
    # scenario. It is kept unchanged because code outside this file may read it;
    # confirm and consider renaming to "tweetsentbr".
    name = "simple_classification"
    description = "Classify tweets into Positive, Negative or Neutral."
    tags = ["classification"]

    def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
        """Convert one split of the dataset into HELM instances.

        Args:
            dataset: Mapping from split name to an iterable of examples. Each
                example is read through its "sentence" and "label" keys.
            split: Key used to select the examples in `dataset`; it is also
                recorded as the `split` of every produced instance.

        Returns:
            One Instance per example, in dataset order.

        Raises:
            KeyError: If `split` is not in `dataset`, or an example's label is
                not one of "Positive", "Negative" or "Neutral".
        """
        # The dataset labels are in English; the references use the Portuguese class names.
        label_names = {"Positive": "Positivo", "Negative": "Negativo", "Neutral": "Neutro"}
        instances: List[Instance] = []
        for example in dataset[split]:
            # Named `tweet` rather than `input` so the builtin is not shadowed.
            tweet = Input(text=example["sentence"])
            # Only the gold label is emitted as a reference, and it carries CORRECT_TAG;
            # the other classes are not listed as incorrect references.
            references = [
                Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
            ]
            instances.append(Instance(input=tweet, references=references, split=split))
        return instances

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load the few-shot TweetSentBR dataset and return all of its instances.

        Args:
            output_path: Directory under which a "data" subdirectory is passed to
                `load_dataset` as its cache directory.

        Returns:
            Instances of the train split followed by those of the test split.
            A split that is absent from the loaded dataset is logged and skipped.
        """
        cache_dir = str(Path(output_path) / "data")
        dataset = load_dataset("eduagarcia/tweetsentbr_fewshot", cache_dir=cache_dir)
        # Dataset split name -> HELM split constant.
        splits: Dict[str, str] = {
            "train": TRAIN_SPLIT,
            "test": TEST_SPLIT,
        }
        instances: List[Instance] = []
        for dataset_split, helm_split in splits.items():
            # Check against the loaded dataset; checking against `splits` itself
            # could never fail, so the skip branch would be unreachable.
            if dataset_split not in dataset:
                hlog(f"{dataset_split} split doesn't exist, skipping")
                continue
            # Re-key the split under the HELM name so process_dataset does not
            # depend on the dataset's and HELM's split names being the same string.
            instances.extend(self.process_dataset({helm_split: dataset[dataset_split]}, helm_split))

        return instances
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
############################################################ | ||
metrics: | ||
# Infrastructure metrics: | ||
- name: num_perplexity_tokens | ||
display_name: '# tokens' | ||
description: Average number of tokens in the predicted output (for language modeling, the input too). | ||
- name: num_bytes | ||
display_name: '# bytes' | ||
description: Average number of bytes in the predicted output (for language modeling, the input too). | ||
|
||
- name: num_references | ||
display_name: '# ref' | ||
description: Number of references. | ||
- name: num_train_trials | ||
display_name: '# trials' | ||
description: Number of trials, where in each trial we choose an independent, random set of training instances. | ||
- name: estimated_num_tokens_cost | ||
display_name: 'cost' | ||
description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request. | ||
- name: num_prompt_tokens | ||
display_name: '# prompt tokens' | ||
description: Number of tokens in the prompt. | ||
- name: num_prompt_characters | ||
display_name: '# prompt chars' | ||
description: Number of characters in the prompt. | ||
- name: num_completion_tokens | ||
display_name: '# completion tokens' | ||
description: Actual number of completion tokens (over all completions). | ||
- name: num_output_tokens | ||
display_name: '# output tokens' | ||
description: Actual number of output tokens. | ||
- name: max_num_output_tokens | ||
display_name: 'Max output tokens' | ||
description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences). | ||
- name: num_requests | ||
display_name: '# requests' | ||
description: Number of distinct API requests. | ||
- name: num_instances | ||
display_name: '# eval' | ||
description: Number of evaluation instances. | ||
- name: num_train_instances | ||
display_name: '# train' | ||
description: Number of training instances (e.g., in-context examples). | ||
- name: prompt_truncated | ||
display_name: truncated | ||
description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples). | ||
- name: finish_reason_length | ||
display_name: finish b/c length | ||
description: Fraction of instances where the output was terminated because of the max tokens limit. | ||
- name: finish_reason_stop | ||
display_name: finish b/c stop | ||
description: Fraction of instances where the output was terminated because of the stop sequences. | ||
- name: finish_reason_endoftext | ||
display_name: finish b/c endoftext | ||
description: Fraction of instances where the output was terminated because the end of text token was generated. | ||
- name: finish_reason_unknown | ||
display_name: finish b/c unknown | ||
description: Fraction of instances where the output was terminated for unknown reasons. | ||
- name: num_completions | ||
display_name: '# completions' | ||
description: Number of completions. | ||
- name: predicted_index | ||
display_name: Predicted index | ||
description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice). | ||
|
||
# Accuracy metrics: | ||
- name: exact_match | ||
display_name: Exact match | ||
short_display_name: EM | ||
description: Fraction of instances that the predicted output matches a correct reference exactly. | ||
lower_is_better: false | ||
- name: quasi_exact_match | ||
display_name: Quasi-exact match | ||
short_display_name: EM | ||
description: Fraction of instances that the predicted output matches a correct reference up to light processing. | ||
lower_is_better: false | ||
- name: prefix_exact_match | ||
display_name: Prefix exact match | ||
short_display_name: PEM | ||
description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly. | ||
lower_is_better: false | ||
- name: quasi_prefix_exact_match | ||
# TODO: should call this prefix_quasi_exact_match | ||
display_name: Prefix quasi-exact match | ||
short_display_name: PEM | ||
description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing. | ||
lower_is_better: false | ||
|
||
|
||
############################################################ | ||
perturbations: [] | ||
|
||
############################################################ | ||
metric_groups: | ||
- name: accuracy | ||
display_name: Accuracy | ||
metrics: | ||
- name: ${main_name} | ||
split: ${main_split} | ||
|
||
- name: efficiency | ||
display_name: Efficiency | ||
metrics: | ||
- name: inference_runtime | ||
split: ${main_split} | ||
|
||
- name: general_information | ||
display_name: General information | ||
hide_win_rates: true | ||
metrics: | ||
- name: num_instances | ||
split: ${main_split} | ||
- name: num_train_instances | ||
split: ${main_split} | ||
- name: prompt_truncated | ||
split: ${main_split} | ||
- name: num_prompt_tokens | ||
split: ${main_split} | ||
- name: num_output_tokens | ||
split: ${main_split} | ||
|
||
############################################################ | ||
run_groups: | ||
- name: core_scenarios | ||
display_name: Core Scenarios | ||
description: Core Scenarios | ||
category: All scenarios | ||
subgroups: | ||
- tweetsentbr | ||
|
||
- name: tweetsentbr | ||
display_name: TweetSentBR | ||
description: TweetSentBR | ||
metric_groups: | ||
- accuracy | ||
- efficiency | ||
- general_information | ||
environment: | ||
main_name: exact_match | ||
main_split: test | ||
taxonomy: | ||
task: "text classification" | ||
what: "tweets with sentiments" | ||
who: "?" | ||
when: "2018" | ||
language: Portuguese |