Skip to content

Commit

Permalink
Merge pull request #2718 from flairNLP/GH-2717-ignore-labels
Browse files (browse the repository at this point in the history)
GH-2717: Add option to ignore labels in dataset
  • Loading branch information
alanakbik authored Apr 10, 2022
2 parents 1fe18be + 78b17f2 commit dfcd35f
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
10 changes: 8 additions & 2 deletions flair/datasets/relation_extraction.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -39,6 +39,7 @@ def __init__(
base_path: Union[str, Path] = None,
in_memory: bool = True,
augment_train: bool = False,
**corpusargs,
):
"""
SemEval-2010 Task 8 on Multi-Way Classification of Semantic Relations Between Pairs of
Expand Down Expand Up @@ -83,6 +84,7 @@ def __init__(
column_format={1: "text", 2: "ner"},
comment_symbol="# ",
in_memory=in_memory,
**corpusargs,
)

def extract_and_convert_to_conllu(self, data_file, data_folder, augment_train):
Expand Down Expand Up @@ -227,7 +229,7 @@ def _semeval_lines_to_token_list(self, raw_lines, augment_relations):


class RE_ENGLISH_TACRED(ColumnCorpus):
- def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
+ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, **corpusargs):
"""
TAC Relation Extraction Dataset with 41 relations from https://nlp.stanford.edu/projects/tacred/.
Manual download is required for this dataset.
Expand Down Expand Up @@ -260,6 +262,7 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
column_format={1: "text", 2: "ner"},
comment_symbol="# ",
in_memory=in_memory,
**corpusargs,
)

def extract_and_convert_to_conllu(self, data_file, data_folder):
Expand Down Expand Up @@ -351,7 +354,7 @@ def _tacred_example_to_token_list(self, example: Dict[str, Any]) -> conllu.Token


class RE_ENGLISH_CONLL04(ColumnCorpus):
- def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
+ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, **corpusargs):
if not base_path:
base_path = flair.cache_root / "datasets"
else:
Expand Down Expand Up @@ -385,6 +388,7 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True):
in_memory=in_memory,
column_format={1: "text", 2: "ner"},
comment_symbol="# ",
**corpusargs,
)

def _parse_incr(self, source_file) -> Iterable[conllu.TokenList]:
Expand Down Expand Up @@ -536,6 +540,7 @@ def __init__(
base_path: Union[str, Path] = None,
in_memory: bool = True,
sentence_splitter: SentenceSplitter = SegtokSentenceSplitter(),
**corpusargs,
):
"""
DrugProt corpus: Biocreative VII Track 1 from https://zenodo.org/record/5119892#.YSdSaVuxU5k/ on
Expand Down Expand Up @@ -570,6 +575,7 @@ def __init__(
sample_missing_splits=False,
column_format={1: "text", 2: "ner", 3: "ner"},
comment_symbol="# ",
**corpusargs,
)

def extract_and_convert_to_conllu(self, data_file, data_folder):
Expand Down
13 changes: 9 additions & 4 deletions flair/datasets/sequence_labeling.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -662,7 +662,8 @@ def _convert_lines_to_sentence(
for span_indices, score, label in predicted_spans:
span = sentence[span_indices[0] : span_indices[-1] + 1]
value = self._remap_label(label)
- span.add_label(span_level_tag_columns[span_column], value=value, score=score)
+ if value != "O":
+     span.add_label(span_level_tag_columns[span_column], value=value, score=score)
except Exception:
pass

Expand All @@ -681,7 +682,9 @@ def _convert_lines_to_sentence(
relation = Relation(
first=sentence[head_start - 1 : head_end], second=sentence[tail_start - 1 : tail_end]
)
- relation.add_label(typename="relation", value=self._remap_label(label))
+ remapped = self._remap_label(label)
+ if remapped != "O":
+     relation.add_label(typename="relation", value=remapped)

if len(sentence) > 0:
return sentence
Expand Down Expand Up @@ -719,15 +722,17 @@ def _parse_token(self, line: str, column_name_map: Dict[int, str], last_token: O
# add each other feature as label-value pair
label_name = feature.split("=")[0]
label_value = self._remap_label(feature.split("=")[1])
- token.add_label(label_name, label_value)
+ if label_value != "O":
+     token.add_label(label_name, label_value)

else:
# get the task name (e.g. 'ner')
label_name = column_name_map[column]
# get the label value
label_value = self._remap_label(fields[column])
# add label
- token.add_label(label_name, label_value)
+ if label_value != "O":
+     token.add_label(label_name, label_value)

if column_name_map[column] == self.SPACE_AFTER_KEY and fields[column] == "-":
token.whitespace_after = False
Expand Down

0 comments on commit dfcd35f

Please sign in to comment.