Skip to content

Commit

Permalink
Merge pull request #20 from ljvmiranda921/add/filipino-slang-norm
Browse files Browse the repository at this point in the history
Closes #15 | Add filipino_slang_norm data loader
  • Loading branch information
SamuelCahyawijaya authored Nov 6, 2023
2 parents 32227ef + ec7d545 commit 77dfaa6
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 0 deletions.
Empty file.
136 changes: 136 additions & 0 deletions seacrowd/sea_datasets/filipino_slang_norm/filipino_slang_norm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import csv
from pathlib import Path
from typing import Dict, Iterator, List, Tuple

import datasets
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

# BibTeX entry for the Flores and Radev (2022) paper; passed to
# `datasets.DatasetInfo(citation=...)` by the builder class.
_CITATION = """
@inproceedings{flores-radev-2022-look,
title = "Look Ma, Only 400 Samples! Revisiting the Effectiveness of Automatic N-Gram Rule Generation for Spelling Normalization in {F}ilipino",
author = "Flores, Lorenzo Jaime and
Radev, Dragomir",
booktitle = "Proceedings of The Third Workshop on Simple and Efficient Natural Language Processing (SustaiNLP)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.sustainlp-1.5",
pages = "29--35",
}
"""

_LOCAL = False
_LANGUAGES = ["fil"]
_DATASETNAME = "filipino_slang_norm"
_DESCRIPTION = """\
This dataset contains 398 abbreviated and/or contracted Filipino words used in
Facebook comments made on weather advisories from a Philippine weather bureau.
volunteers.
"""

# Repository hosting the data; reported as the dataset homepage.
_HOMEPAGE = "https://github.com/ljyflores/efficient-spelling-normalization-filipino"
# Recorded as unknown in the dataset metadata.
_LICENSE = Licenses.UNKNOWN.value
# Download URL of the CSV file for each split, keyed by split name.
_URLS = {
"train": "https://github.com/ljyflores/efficient-spelling-normalization-filipino/raw/main/data/train_words.csv",
"test": "https://github.com/ljyflores/efficient-spelling-normalization-filipino/raw/main/data/test_words.csv",
}

# NOTE(review): presumably the SEACrowd task tag for lexical normalization --
# confirm against `seacrowd.utils.constants.Tasks`.
_SUPPORTED_TASKS = [Tasks.MULTILEXNORM]
# Version strings; the builder class wraps them in `datasets.Version`.
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class FilipinoSlangNormDataset(datasets.GeneratorBasedBuilder):
    """Filipino Slang Norm dataset by Flores and Radev (2022).

    The loader reads two-column CSV files (one per split) and exposes them in
    two layouts, selected by the active config's ``schema``:

    * ``source``: ``id``, ``src_sent`` (first column), ``norm_sent`` (second
      column).
    * ``seacrowd_t2t``: the shared SEACrowd text-to-text features, with
      ``text_1``/``text_2`` carrying the first/second column.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    SEACROWD_SCHEMA_NAME = "t2t"

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=_DATASETNAME,
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
            subset_id=_DATASETNAME,
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata for the selected schema.

        Raises:
            ValueError: If the config's schema is neither ``source`` nor
                ``seacrowd_t2t``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "src_sent": datasets.Value("string"),
                    "norm_sent": datasets.Value("string"),
                }
            )
        elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
            features = schemas.text2text_features
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(f"Invalid config schema: {self.config.schema}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the train/test CSV files and return one generator per split.

        Args:
            dl_manager: Download manager used to fetch the files in ``_URLS``.

        Returns:
            Split generators for ``TRAIN`` and ``TEST``, each passing the local
            file path and the split name to ``_generate_examples``.
        """
        data_files = {
            "train": Path(dl_manager.download_and_extract(_URLS["train"])),
            "test": Path(dl_manager.download_and_extract(_URLS["test"])),
        }

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": data_files["train"],
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": data_files["test"],
                    "split": "test",
                },
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Iterator[Tuple[int, Dict]]:
        """Yield ``(key, example)`` tuples read from a two-column CSV file.

        Args:
            filepath: Path to the CSV file; the first column is emitted as the
                source text and the second as its normalized form.
            split: Name of the split being generated. Not used by the parsing
                logic; accepted because ``_split_generators`` passes it in
                ``gen_kwargs``.

        Yields:
            ``(guid, example)`` where ``guid`` is the zero-based row index in
            the file and ``example`` follows the active schema.

        Raises:
            ValueError: If the config's schema is unsupported, or a non-empty
                row does not contain exactly two fields.
        """
        # The schema is constant for the whole run, so resolve it once.
        schema = self.config.schema
        use_source_schema = schema == "source"
        if not use_source_schema and schema != f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
            raise ValueError(f"Invalid config schema: {schema}")

        # newline="" hands line-ending and quoting rules over to the csv module.
        with open(filepath, encoding="utf-8", newline="") as f:
            for guid, row in enumerate(csv.reader(f)):
                if not row:
                    # Blank line (e.g. a stray newline at end of file).
                    continue
                if len(row) != 2:
                    raise ValueError(
                        f"{filepath}: expected 2 fields on row {guid}, got {len(row)}"
                    )
                src_sent, norm_sent = row

                if use_source_schema:
                    example = {
                        "id": str(guid),
                        "src_sent": src_sent,
                        "norm_sent": norm_sent,
                    }
                else:
                    example = {
                        "id": str(guid),
                        "text_1": src_sent,
                        "text_2": norm_sent,
                        "text_1_name": "src_sent",
                        "text_2_name": "norm_sent",
                    }
                yield guid, example

0 comments on commit 77dfaa6

Please sign in to comment.