diff --git a/seacrowd/sea_datasets/lexitron/__init__.py b/seacrowd/sea_datasets/lexitron/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/seacrowd/sea_datasets/lexitron/lexitron.py b/seacrowd/sea_datasets/lexitron/lexitron.py
new file mode 100644
index 000000000..9834abb8b
--- /dev/null
+++ b/seacrowd/sea_datasets/lexitron/lexitron.py
@@ -0,0 +1,295 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Corpus-based dictionary of the Thai and English languages. This dataset
+contains frequently used words drawn from trusted publications such as
+novels, academic documents, and newspapers. The download provides both a
+Thai-English lexicon (about 53,000 entries) and an English-Thai lexicon
+(about 83,000 entries). Each Thai-English entry consists of the headword,
+part of speech, English translation, synonyms, and sample sentences.
+"""
+import os
+import re
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+import pandas as pd
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses, Tasks
+
+# There are no citations available for this dataset.
+_CITATION = ""
+
+_DATASETNAME = "lexitron"
+
+_DESCRIPTION = """
+Corpus-based dictionary of the Thai and English languages. This dataset
+contains frequently used words drawn from trusted publications such as
+novels, academic documents, and newspapers. The download provides both a
+Thai-English lexicon (about 53,000 entries) and an English-Thai lexicon
+(about 83,000 entries). Each Thai-English entry consists of the headword,
+part of speech, English translation, synonyms, and sample sentences.
+"""
+
+_HOMEPAGE = "https://opend-portal.nectec.or.th/dataset/lexitron-2-0"
+
+_LANGUAGES = ["tha"]
+
+_LICENSE = Licenses.OTHERS.value
+
+_LOCAL = False
+
+# Both subsets are distributed in the same zip archive.
+_URLS = {
+    "telex": "https://opend-portal.nectec.or.th/dataset/bdd85296-9398-499f-b3a7-aab85042d3f9/resource/761924ea-937f-4be3-afe1-c031c754fa39/download/lexitron_2.0.zip",
+    "etlex": "https://opend-portal.nectec.or.th/dataset/bdd85296-9398-499f-b3a7-aab85042d3f9/resource/761924ea-937f-4be3-afe1-c031c754fa39/download/lexitron_2.0.zip",
+}
+
+_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION]
+
+_SOURCE_VERSION = "1.0.0"
+
+_SEACROWD_VERSION = "1.0.0"
+
+
+class LEXiTRONDataset(datasets.GeneratorBasedBuilder):
+    """
+    Corpus-based dictionary of the Thai and English languages. This dataset
+    contains frequently used words drawn from trusted publications such as
+    novels, academic documents, and newspapers. The download provides both a
+    Thai-English lexicon (about 53,000 entries) and an English-Thai lexicon
+    (about 83,000 entries). Each Thai-English entry consists of the headword,
+    part of speech, English translation, synonyms, and sample sentences.
+    """
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+    SEACROWD_SCHEMA_NAME = "t2t"
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_telex_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}_telex",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_telex_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=f"{_DATASETNAME}_telex",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_etlex_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}_etlex",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_etlex_seacrowd_{SEACROWD_SCHEMA_NAME}",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema=f"seacrowd_{SEACROWD_SCHEMA_NAME}",
+            subset_id=f"{_DATASETNAME}_etlex",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_telex_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source":
+
+            translation_type = self.config.name.split("_")[1]
+
+            if translation_type == "telex":
+                features = datasets.Features(
+                    {
+                        "id": datasets.Value("int64"),
+                        "tsearch": datasets.Value("string"),
+                        "tentry": datasets.Value("string"),
+                        "eentry": datasets.Value("string"),
+                        "tcat": datasets.Value("string"),
+                        "tsyn": datasets.Value("string"),
+                        "tsample": datasets.Value("string"),
+                        "tdef": datasets.Value("string"),
+                    }
+                )
+
+            elif translation_type == "etlex":
+                features = datasets.Features(
+                    {
+                        "id": datasets.Value("int64"),
+                        "esearch": datasets.Value("string"),
+                        "eentry": datasets.Value("string"),
+                        "tentry": datasets.Value("string"),
+                        "ecat": datasets.Value("string"),
+                        "esyn": datasets.Value("string"),
+                    }
+                )
+
+        elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+            features = schemas.text2text_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+
+        translation_type = self.config.name.split("_")[1]
+        data_dir = dl_manager.download_and_extract(_URLS[translation_type])
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, f"LEXiTRON_2.0/{translation_type}"),
+                    "split": "train",
+                },
+            )
+        ]
+
+    def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+
+        translation_type = self.config.name.split("_")[1]
+
+        if translation_type == "telex":
+
+            # Each record in the raw file is an XML-like <Doc>...</Doc> block
+            # whose child tags match the source-schema feature names.
+            with open(filepath, "r", encoding="latin-1") as file:
+                data = file.read()
+
+            pattern = r"<Doc>(.*?)</Doc>"
+            docs = re.findall(pattern, data, re.DOTALL)
+
+            doc_data = []
+
+            for doc in docs:
+                tsearch = tentry = eentry = tcat = tsyn = tsample = tdef = id = None
+
+                tsearch_match = re.search(r"<tsearch>(.*?)</tsearch>", doc)
+                if tsearch_match:
+                    tsearch = tsearch_match.group(1)
+
+                tentry_match = re.search(r"<tentry>(.*?)</tentry>", doc)
+                if tentry_match:
+                    tentry = tentry_match.group(1)
+
+                eentry_match = re.search(r"<eentry>(.*?)</eentry>", doc)
+                if eentry_match:
+                    eentry = eentry_match.group(1)
+
+                tcat_match = re.search(r"<tcat>(.*?)</tcat>", doc)
+                if tcat_match:
+                    tcat = tcat_match.group(1)
+
+                tsyn_match = re.search(r"<tsyn>(.*?)</tsyn>", doc)
+                if tsyn_match:
+                    tsyn = tsyn_match.group(1)
+
+                tsample_match = re.search(r"<tsample>(.*?)</tsample>", doc)
+                if tsample_match:
+                    tsample = tsample_match.group(1)
+
+                tdef_match = re.search(r"<tdef>(.*?)</tdef>", doc)
+                if tdef_match:
+                    tdef = tdef_match.group(1)
+
+                id_match = re.search(r"<id>(.*?)</id>", doc)
+                if id_match:
+                    id = id_match.group(1)
+
+                doc_data.append({"id": id, "tsearch": tsearch, "tentry": tentry, "eentry": eentry, "tcat": tcat, "tsyn": tsyn, "tsample": tsample, "tdef": tdef})
+
+            df = pd.DataFrame(doc_data)
+
+        if translation_type == "etlex":
+
+            # Same record layout as telex, with English-first field names.
+            with open(filepath, "r", encoding="latin-1") as file:
+                data = file.read()
+
+            pattern = r"<Doc>(.*?)</Doc>"
+            docs = re.findall(pattern, data, re.DOTALL)
+
+            doc_data = []
+
+            for doc in docs:
+                esearch = eentry = tentry = ecat = esyn = id = None
+
+                esearch_match = re.search(r"<esearch>(.*?)</esearch>", doc)
+                if esearch_match:
+                    esearch = esearch_match.group(1)
+
+                eentry_match = re.search(r"<eentry>(.*?)</eentry>", doc)
+                if eentry_match:
+                    eentry = eentry_match.group(1)
+
+                tentry_match = re.search(r"<tentry>(.*?)</tentry>", doc)
+                if tentry_match:
+                    tentry = tentry_match.group(1)
+
+                ecat_match = re.search(r"<ecat>(.*?)</ecat>", doc)
+                if ecat_match:
+                    ecat = ecat_match.group(1)
+
+                esyn_match = re.search(r"<esyn>(.*?)</esyn>", doc)
+                if esyn_match:
+                    esyn = esyn_match.group(1)
+
+                id_match = re.search(r"<id>(.*?)</id>", doc)
+                if id_match:
+                    id = id_match.group(1)
+
+                doc_data.append({"id": id, "esearch": esearch, "eentry": eentry, "tentry": tentry, "ecat": ecat, "esyn": esyn})
+
+            df = pd.DataFrame(doc_data)
+
+        for index, row in df.iterrows():
+
+            if self.config.schema == "source":
+                example = row.to_dict()
+
+            elif self.config.schema == f"seacrowd_{self.SEACROWD_SCHEMA_NAME}":
+
+                if translation_type == "telex":
+                    example = {
+                        "id": str(index),
+                        "text_1": str(row["tentry"]),
+                        "text_2": str(row["eentry"]),
+                        "text_1_name": "tentry",
+                        "text_2_name": "eentry",
+                    }
+
+                if translation_type == "etlex":
+                    example = {
+                        "id": str(index),
+                        "text_1": str(row["eentry"]),
+                        "text_2": str(row["tentry"]),
+                        "text_1_name": "eentry",
+                        "text_2_name": "tentry",
+                    }
+
+            yield index, example
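
For reviewers, a minimal usage sketch of the new dataloader (a sketch, not part of this diff: it assumes a `datasets` version that still supports loading a local dataset script by path, and the config names are taken from the BUILDER_CONFIGS defined above):

import datasets

# Thai->English lexicon in the source schema: one row per <Doc> record,
# with the raw fields (tsearch, tentry, eentry, tcat, tsyn, tsample, tdef).
telex = datasets.load_dataset(
    "seacrowd/sea_datasets/lexitron/lexitron.py",
    name="lexitron_telex_source",
)

# English->Thai lexicon in the SEACrowd text-to-text schema: each row pairs
# an English headword (text_1) with its Thai translation (text_2).
etlex = datasets.load_dataset(
    "seacrowd/sea_datasets/lexitron/lexitron.py",
    name="lexitron_etlex_seacrowd_t2t",
)

print(telex["train"][0])
print(etlex["train"][0])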