Skip to content

Commit

Permalink
Merge pull request #2708 from flairNLP/hipe-2022-preprocessing-fn
Browse files Browse the repository at this point in the history
datasets: add possibility to use custom preprocessing function for HIPE-2022
  • Loading branch information
alanakbik authored Apr 10, 2022
2 parents fe3fe1f + 2f846b8 commit 1fe18be
Showing 1 changed file with 32 additions and 28 deletions.
60 changes: 32 additions & 28 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -4129,6 +4129,33 @@ def __init__(


class NER_HIPE_2022(ColumnCorpus):
@staticmethod
def _prepare_corpus(
    file_in: Path, file_out: Path, eos_marker: str, document_separator: str, add_document_separator: bool
):
    """Default preprocessing: rewrite one original HIPE-2022 TSV split into a column file.

    The first line of ``file_in`` is treated as the header and is followed by an
    empty line in the output. Every later line is stripped and copied, except
    lines starting with ``" \\t"`` (empty tokens), which are dropped. An empty
    line is emitted after each line containing ``eos_marker`` so sentences end
    up separated by blank lines.

    :param file_in: Path of the original TSV file to read.
    :param file_out: Path of the preprocessed file to write (overwritten if present).
    :param eos_marker: Substring that marks the last token of a sentence.
    :param document_separator: Prefix of the lines that start a new document.
    :param add_document_separator: If True, a ``-DOCSTART- O`` line (followed by an
        empty line) is written before every line starting with ``document_separator``.
    :raises IndexError: If ``file_in`` contains no lines at all (there is no header).
    """
    # Pin the encoding so reading and writing do not depend on the platform's
    # locale default. NOTE(review): the corpus files are presumably UTF-8 - confirm
    # against the upstream data.
    with open(file_in, "rt", encoding="utf-8") as f_p:
        lines = f_p.readlines()

    with open(file_out, "wt", encoding="utf-8") as f_out:
        # The header line keeps its own line break; the extra "\n" adds the
        # missing empty line after the header.
        f_out.write(lines[0] + "\n")

        for line in lines[1:]:
            if line.startswith(" \t"):
                # Workaround for empty tokens
                continue

            line = line.strip()

            # Add "real" document marker
            if add_document_separator and line.startswith(document_separator):
                f_out.write("-DOCSTART- O\n\n")

            f_out.write(line + "\n")

            # End of sentence reached: separate it from the next one.
            if eos_marker in line:
                f_out.write("\n")

def __init__(
self,
dataset_name: str,
Expand All @@ -4141,6 +4168,7 @@ def __init__(
dev_split_name="dev",
add_document_separator=False,
sample_missing_splits=False,
preproc_fn=None,
**corpusargs,
):
"""
Expand All @@ -4159,6 +4187,7 @@ def __init__(
:add_document_separator: If True, a special document separator will be introduced. This is highly
recommended when using our FLERT approach.
:sample_missing_splits: If True, data is automatically sampled when certain data splits are None.
:preproc_fn: Function that is used for dataset preprocessing. If None, default preprocessing will be performed.
"""
if not base_path:
base_path = flair.cache_root / "datasets"
Expand Down Expand Up @@ -4221,10 +4250,12 @@ def __init__(

dev_path = new_data_folder / dev_file

self.preproc_fn = self._prepare_corpus if not preproc_fn else preproc_fn

if not dev_path.exists():
for split in dataset_splits:
original_filename = f"HIPE-2022-{version}-{dataset_name}-{split}-{language}.tsv"
self.__prepare_corpus(
self.preproc_fn(
data_folder / "original" / original_filename,
new_data_folder / f"{split}.txt",
eos_marker,
Expand All @@ -4247,30 +4278,3 @@ def __init__(
sample_missing_splits=sample_missing_splits,
**corpusargs,
)

@staticmethod
def __prepare_corpus(
    file_in: Path, file_out: Path, eos_marker: str, document_separator: str, add_document_separator: bool
):
    """Rewrite one original HIPE-2022 TSV split into a column file.

    The first line of ``file_in`` is treated as the header and is followed by an
    empty line in the output. Every later line is stripped and copied, except
    lines starting with ``" \\t"`` (empty tokens), which are dropped. An empty
    line is emitted after each line containing ``eos_marker`` so sentences end
    up separated by blank lines.

    :param file_in: Path of the original TSV file to read.
    :param file_out: Path of the preprocessed file to write (overwritten if present).
    :param eos_marker: Substring that marks the last token of a sentence.
    :param document_separator: Prefix of the lines that start a new document.
    :param add_document_separator: If True, a ``-DOCSTART- O`` line (followed by an
        empty line) is written before every line starting with ``document_separator``.
    :raises IndexError: If ``file_in`` contains no lines at all (there is no header).
    """
    # Pin the encoding so reading and writing do not depend on the platform's
    # locale default. NOTE(review): the corpus files are presumably UTF-8 - confirm
    # against the upstream data.
    with open(file_in, "rt", encoding="utf-8") as f_p:
        lines = f_p.readlines()

    with open(file_out, "wt", encoding="utf-8") as f_out:
        # The header line keeps its own line break; the extra "\n" adds the
        # missing empty line after the header.
        f_out.write(lines[0] + "\n")

        for line in lines[1:]:
            if line.startswith(" \t"):
                # Workaround for empty tokens
                continue

            line = line.strip()

            # Add "real" document marker
            if add_document_separator and line.startswith(document_separator):
                f_out.write("-DOCSTART- O\n\n")

            f_out.write(line + "\n")

            # End of sentence reached: separate it from the next one.
            if eos_marker in line:
                f_out.write("\n")

0 comments on commit 1fe18be

Please sign in to comment.