Skip to content

Commit

Permalink
Closes SEACrowd#14 | Create dataset loader for ara-close-lange (SEACr…
Browse files Browse the repository at this point in the history
…owd#243)

* Add ara_close dataloader

* Rename class name to AraCloseDataset
  • Loading branch information
MJonibek authored and Railey Montalan committed Feb 28, 2024
1 parent b9bcd2d commit 084f228
Show file tree
Hide file tree
Showing 2 changed files with 194 additions and 0 deletions.
Empty file.
194 changes: 194 additions & 0 deletions seacrowd/sea_datasets/ara_close/ara_close.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" \
The dataset contribution of this study is a compilation of short fictional stories \
written in Bikol for readability assessment. The data was combined other collected \
Philippine language corpora, such as Tagalog and Cebuano. The data from these languages \
are all distributed across the Philippine elementary system's first three grade \
levels (L1, L2, L3). We sourced this dataset from Let's Read Asia (LRA), Bloom Library, \
Department of Education, and Adarna House.
"""

from pathlib import Path
from typing import Dict, List, Tuple

import datasets

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

_CITATION = """\
@inproceedings{imperial-kochmar-2023-automatic,
title = "Automatic Readability Assessment for Closely Related Languages",
author = "Imperial, Joseph Marvin and
Kochmar, Ekaterina",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.331",
doi = "10.18653/v1/2023.findings-acl.331",
pages = "5371--5386",
abstract = "In recent years, the main focus of research on automatic readability assessment (ARA) \
has shifted towards using expensive deep learning-based methods with the primary goal of increasing models{'} accuracy. \
This, however, is rarely applicable for low-resource languages where traditional handcrafted features are still \
widely used due to the lack of existing NLP tools to extract deeper linguistic representations. In this work, \
we take a step back from the technical component and focus on how linguistic aspects such as mutual intelligibility \
or degree of language relatedness can improve ARA in a low-resource setting. We collect short stories written in three \
languages in the Philippines{---}Tagalog, Bikol, and Cebuano{---}to train readability assessment models and explore the \
interaction of data and features in various cross-lingual setups. Our results show that the inclusion of CrossNGO, \
a novel specialized feature exploiting n-gram overlap applied to languages with high mutual intelligibility, \
significantly improves the performance of ARA models compared to the use of off-the-shelf large multilingual \
language models alone. Consequently, when both linguistic representations are combined, we achieve state-of-the-art \
results for Tagalog and Cebuano, and baseline scores for ARA in Bikol.",
}
"""

_DATASETNAME = "ara_close"

_DESCRIPTION = """\
The dataset contribution of this study is a compilation of short fictional stories \
written in Bikol for readability assessment. The data was combined other collected \
Philippine language corpora, such as Tagalog and Cebuano. The data from these languages \
are all distributed across the Philippine elementary system's first three grade \
levels (L1, L2, L3). We sourced this dataset from Let's Read Asia (LRA), Bloom Library, \
Department of Education, and Adarna House. \
"""

_HOMEPAGE = "https://github.com/imperialite/ara-close-lang"

_LANGUAGES = ["bcl", "ceb"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data)

_LICENSE = Licenses.CC_BY_4_0.value # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value

_LOCAL = False

_URLS = {
"bcl": "https://raw.githubusercontent.com/imperialite/ara-close-lang/main/data/bikol/bik_all_data.txt",
# 'tgl': '', # file for tgl language was deleted
"ceb": "https://raw.githubusercontent.com/imperialite/ara-close-lang/main/data/cebuano/ceb_all_data.txt",
}

_SUPPORTED_TASKS = [Tasks.READABILITY_ASSESSMENT]

_SOURCE_VERSION = "1.0.0"

_SEACROWD_VERSION = "1.0.0"


class AraCloseDataset(datasets.GeneratorBasedBuilder):
f"""{_DESCRIPTION}"""

SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

BUILDER_CONFIGS = [SEACrowdConfig(name=f"{_DATASETNAME}_{lang}_source", version=datasets.Version(_SOURCE_VERSION), description=f"{_DATASETNAME} source schema", schema="source", subset_id=f"{_DATASETNAME}",) for lang in _LANGUAGES] + [
SEACrowdConfig(
name=f"{_DATASETNAME}_{lang}_seacrowd_text",
version=datasets.Version(_SEACROWD_VERSION),
description=f"{_DATASETNAME} SEACrowd schema",
schema="seacrowd_text",
subset_id=f"{_DATASETNAME}",
)
for lang in _LANGUAGES
]

DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

def _info(self) -> datasets.DatasetInfo:

if self.config.schema == "source":
features = datasets.Features(
{
"title": datasets.Value("string"),
"text": datasets.Value("string"),
"label": datasets.Value("string"),
}
)

elif self.config.schema == "seacrowd_text":
features = schemas.text_features(["1", "2", "3"])

return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""

lang = self.config.name.split("_")[2]
if lang in _LANGUAGES:
data_path = Path(dl_manager.download_and_extract(_URLS[lang]))
else:
data_path = [Path(dl_manager.download_and_extract(_URLS[lang])) for lang in _LANGUAGES]

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": data_path,
"split": "train",
},
)
]

def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""
lang = self.config.name.split("_")[2]
if lang in _LANGUAGES:
file_content = open(filepath, "r").readlines()
else:
file_content = []
for path in filepath:
lines = open(path, "r").readlines()
file_content.extend(lines)

if self.config.schema == "source":
idx = 0
for line in file_content:
split_data = line.strip().split(",")
title = split_data[0]
label = split_data[1]
text = ",".join(split_data[2:])
ex = {"title": title, "text": text, "label": label}
yield idx, ex
idx += 1

elif self.config.schema == "seacrowd_text":
idx = 0
for line in file_content:
split_data = line.strip().split(",")
title = split_data[0]
label = split_data[1]
text = ",".join(split_data[2:])
ex = {
"id": idx,
"text": text,
"label": label,
}
yield idx, ex
idx += 1
else:
raise ValueError(f"Invalid config: {self.config.name}")

0 comments on commit 084f228

Please sign in to comment.