Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #29 | Add dataset loader for IJELID #45

Merged
merged 2 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
142 changes: 142 additions & 0 deletions seacrowd/sea_datasets/ijelid/ijelid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from pathlib import Path
from typing import Dict, List, Tuple

import datasets
from datasets.download.download_manager import DownloadManager

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses, Tasks

# BibTeX citation for the dataset paper (PeerJ Computer Science, 2023).
_CITATION = """
@article{hidayatullah2023corpus,
title={Corpus creation and language identification for code-mixed Indonesian-Javanese-English Tweets},
author={Hidayatullah, Ahmad Fathan and Apong, Rosyzie Anna and Lai, Daphne TC and Qazi, Atika},
journal={PeerJ Computer Science},
volume={9},
pages={e1312},
year={2023},
publisher={PeerJ Inc.}
}
"""

# Data is fetched from a public GitHub repository, not bundled locally.
_LOCAL = False
# ISO 639-3 codes for the languages mixed in the tweets.
_LANGUAGES = ["ind", "jav", "eng"]
_DATASETNAME = "ijelid"
# NOTE: fixed a typo in the label list — the original text ran
# "ID (Indonesian)JV (Javanese)" together without a separator.
_DESCRIPTION = """\
This is a code-mixed Indonesian-Javanese-English dataset for token-level
language identification. We named this dataset as IJELID
(Indonesian-Javanese-English Language Identification). This dataset contains
tweets that have been tokenized with the corresponding token and its language
label. There are seven language labels in the dataset, namely: ID (Indonesian),
JV (Javanese), EN (English), MIX_ID_EN (mixed Indonesian-English), MIX_ID_JV (mixed
Indonesian-Javanese), MIX_JV_EN (mixed Javanese-English), OTH (Other).
"""

# Upstream repository that hosts the raw data files.
_HOMEPAGE = "https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data"
# CC BY-NC-SA 4.0, per the Licenses enum value used here.
_LICENSE = Licenses.CC_BY_NC_SA_4_0.value
# Direct raw-file URLs per split; note the upstream file backing the
# "dev" split is named val.tsv.
_URLS = {
    "train": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/train.tsv",
    "dev": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/val.tsv",
    "test": "https://raw.githubusercontent.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data/main/test.tsv",
}

# Token-level language identification; mapped to the SEQ_LABEL schema
# in seacrowd/utils/constants.py.
_SUPPORTED_TASKS = [Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION]
_SOURCE_VERSION = "1.0.0"
_SEACROWD_VERSION = "1.0.0"


class IJELIDDataset(datasets.GeneratorBasedBuilder):
    """Loader for the IJELID token-level language-identification dataset.

    Tweets are stored CoNLL-style in TSV files: one ``token<TAB>label``
    pair per line, with blank lines separating tweets. Source repo:
    https://github.com/fathanick/Code-mixed-Indonesian-Javanese-English-Twitter-Data
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    SEACROWD_SCHEMA_NAME = "seq_label"
    # The seven token-level language tags that appear in the TSV files.
    LABEL_CLASSES = ["ID", "JV", "EN", "MIX_ID_EN", "MIX_ID_JV", "MIX_JV_EN", "OTH"]

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=_DATASETNAME,
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_{SEACROWD_SCHEMA_NAME}",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
            subset_id=_DATASETNAME,
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    def _info(self) -> datasets.DatasetInfo:
        """Return dataset metadata.

        The source data has no schema of its own, so for consistency the
        source config reuses the SEACrowd seq_label feature schema.
        """
        features = schemas.seq_label_features(self.LABEL_CLASSES)

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the per-split TSV files and return one generator per split."""
        # _URLS maps the canonical split name to the upstream raw file.
        data_files = {split: Path(dl_manager.download_and_extract(url)) for split, url in _URLS.items()}

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": data_files["train"], "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": data_files["dev"], "split": "dev"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": data_files["test"], "split": "test"},
            ),
        ]

    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
        """Yield (key, example) tuples from one CoNLL-style TSV file.

        Each non-blank line is ``token<TAB>label``; a blank line ends the
        current tweet. Examples carry {id, tokens, labels}, matching the
        seq_label schema used by both configs.
        """
        with open(filepath, encoding="utf-8") as f:
            guid = 0
            tokens = []
            labels = []
            for line in f:
                # A blank (or whitespace-only) line ends the current tweet.
                # Using strip() instead of the exact `line == "\n"` test keeps
                # this robust to Windows-style "\r\n" separators and stray
                # spaces, which would otherwise fall through to split("\t")
                # and raise ValueError on unpacking.
                if not line.strip():
                    if tokens:
                        yield guid, {
                            "id": str(guid),
                            "tokens": tokens,
                            "labels": labels,
                        }
                        guid += 1
                        tokens = []
                        labels = []
                else:
                    # Token and label are tab-separated; the label carries the
                    # trailing newline, hence rstrip().
                    token, label = line.split("\t")
                    tokens.append(token)
                    labels.append(label.rstrip())

            # Flush the final tweet if the file does not end with a blank line.
            if tokens:
                yield guid, {
                    "id": str(guid),
                    "tokens": tokens,
                    "labels": labels,
                }
2 changes: 2 additions & 0 deletions seacrowd/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class Tasks(Enum):
KEYWORD_TAGGING = "KT"
NAMED_ENTITY_RECOGNITION = "NER"
SENTENCE_ORDERING = "SO"
TOKEN_LEVEL_LANGUAGE_IDENTIFICATION = "LANGID"

# Pair Text Classification
QUESTION_ANSWERING = "QA"
Expand Down Expand Up @@ -182,6 +183,7 @@ class Licenses(Enum):
Tasks.POS_TAGGING: "SEQ_LABEL",
Tasks.KEYWORD_TAGGING: "SEQ_LABEL",
Tasks.SENTENCE_ORDERING: "SEQ_LABEL",
Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL",
Tasks.QUESTION_ANSWERING: "QA",
Tasks.TEXTUAL_ENTAILMENT: "PAIRS",
Tasks.SEMANTIC_SIMILARITY: "PAIRS_SCORE",
Expand Down