From e0082f169a54719f65099bdf2f46144e7adff5f6 Mon Sep 17 00:00:00 2001 From: bryanwilie Date: Sun, 28 Jan 2024 00:11:52 +0800 Subject: [PATCH 1/5] add malindo_parallel.py --- .../malindo_parallel/malindo_parallel.py | 297 ++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py new file mode 100644 index 000000000..7be7097d9 --- /dev/null +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -0,0 +1,297 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the SEACrowd Datahub repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. +""" +import os +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME + + +# TODO: Add BibTeX citation +_CITATION = """\ +@misc{MALINDO-parallel, + title = "MALINDO-parallel", + howpublished = "https://github.com/matbahasa/MALINDO_Parallel/blob/master/README.md", + note = "Accessed: 2023-01-27", +} +""" + +# TODO: create a module level variable with your dataset name (should match script name) +# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer +_DATASETNAME = "malindo_parallel" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +Teks ini adalah skrip video untuk Kampus Terbuka Universiti Bahasa Asing Tokyo pada tahun 2020. 
Tersedia parallel sentences dalam Bahasa Melayu/Indonesia dan Bahasa Jepang +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "https://github.com/matbahasa/MALINDO_Parallel/tree/master/OpenCampusTUFS" + +# TODO: Add languages related to this dataset +_LANGUAGES = ["zlm", "jpn"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +# TODO: Add the licence for the dataset here +# Note that this doesn't have to be a common open source license. +# In the case of the dataset intentionally is built without license, please use `Licenses.UNLICENSE.value` +# In the case that it's not clear whether the dataset has a license or not, please use `Licenses.UNKNOWN.value` +# Some datasets may also have custom licenses. In this case, simply put f'{Licenses.OTHERS.value} | {FULL_LICENSE_TERM}' into `_LICENSE` +_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value + +# TODO: Add a _LOCAL flag to indicate whether the data cannot be sourced from a public link +# E.g. the dataset requires signing a specific term of use, the dataset is sent through email, etc. +_LOCAL = False + +# TODO: Add links to the urls needed to download your dataset files. +# For local datasets, this variable can be an empty dictionary. + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# In most cases the URLs will be the same for the source and seacrowd config. +# However, if you need to access different files for each config you can have multiple entries in this dict. +# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + _DATASETNAME: "https://github.com/matbahasa/MALINDO_Parallel/blob/master/OpenCampusTUFS/OCTUFS2020.txt", +} + +# TODO: add supported task by dataset. One dataset may support multiple tasks +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" +# This version doesn't have to be consistent with semantic versioning. Anything that is +# provided by the original dataset as a version goes. +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +class MalindoParallel(datasets.GeneratorBasedBuilder): + """Data terjemahan bahasa Melayu/Indonesia""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + # You will be able to load the "source" or "seacrowd" configurations with + # ds_source = datasets.load_dataset('my_dataset', name='source') + # ds_seacrowd = datasets.load_dataset('my_dataset', name='seacrowd') + + # For local datasets you can make use of the `data_dir` and `data_files` kwargs + # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits + # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") + # ds_seacrowd = datasets.load_dataset('my_dataset', name='seacrowd', data_dir="/path/to/data/files") + + # TODO: For each dataset, implement Config for Source and SEACrowd; + # If dataset contains more than one subset (see seacrowd/sea_datasets/smsa.py) implement for EACH of them. 
+ # Each of them should contain: + # - name: should be unique for each dataset config eg. smsa_(source|seacrowd)_[seacrowd_schema_name] + # - version: option = (SOURCE_VERSION|SEACROWD_VERSION) + # - description: one line description for the dataset + # - schema: options = (source|seacrowd_[seacrowd_schema_name]) + # - subset_id: subset id is the canonical name for the dataset (eg. smsa) + # where [seacrowd_schema_name] can be checked in seacrowd/utils/constants.py + # under variable `TASK_TO_SCHEMA`, in accordance to values from `_SUPPORTED_TASKS` + # for all config(s) defined + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="malindo_parallel_source", + version=SOURCE_VERSION, + description="malindo_parallel source schema", + schema="source", + subset_id="malindo_parallel", + ), + SEACrowdConfig( + name="malindo_parallel_seacrowd_t2t", + version=SEACROWD_VERSION, + description="malindo_parallel SEACrowd schema", + schema="seacrowd_t2t", + subset_id="malindo_parallel", + ), + ] + + DEFAULT_CONFIG_NAME = "malindo_parallel_source" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. + # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "source": + # TODO: Create your source schema here + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string") + }) + # raise NotImplementedError() + + # EX: Arbitrary NER type dataset + # features = datasets.Features( + # { + # "doc_id": datasets.Value("string"), + # "text": datasets.Value("string"), + # "entities": [ + # { + # "offsets": [datasets.Value("int64")], + # "text": datasets.Value("string"), + # "type": datasets.Value("string"), + # "entity_id": datasets.Value("string"), + # } + # ], + # } + # ) + + # Choose the appropriate seacrowd schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. + + # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple seacrowd configs with a seacrowd_[seacrowd_schema_name] format. + + # For example seacrowd_kb, seacrowd_t2t + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + + # If you need to access the "source" or "seacrowd" config choice, that will be in self.config.name + + # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath + + # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. 
+ + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager + + # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + + # TODO: KEEP if your dataset is PUBLIC; remove if not + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + # # TODO: KEEP if your dataset is LOCAL; remove if NOT + # if self.config.data_dir is None: + # raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + # else: + # data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: +# """Yields examples as (key, example) tuples.""" +# # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + +# # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. + +# # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + data = json.load(open(filepath, "r")) + # with open(filepath) as f: + # file = f.read() + # data = json.loads(file) + rows = data['payload']['blob']['rawLines'] + + if self.config.schema == "source": + # TODO: yield (key, example) tuples in the original dataset schema + # for key, example in thing: + # yield key, example + + for i, row in enumerate(rows): + t1idx = row.find('\t')+1 + t2idx = row[t1idx:].find('\t') + row_id = row[:t1idx] + row_melayu = row[t1idx:t1idx+t2idx] + row_japanese = row[t1idx+t2idx+1:-1] + ex = { + "id": i, + "text": row_melayu+'\t'+row_japanese + } + yield i, ex + + elif self.config.schema == "seacrowd_t2t": + # TODO: yield (key, example) tuples in the seacrowd schema + # for key, example in thing: + # yield key, example + + for i, row in enumerate(rows): + t1idx = row.find('\t')+1 + t2idx = row[t1idx:].find('\t') + row_id = row[:t1idx] + row_melayu = row[t1idx:t1idx+t2idx] + row_japanese = row[t1idx+t2idx+1:-1] + ex = { + "id": i, + "text_1": row_melayu, + "text_2": row_japanese, + "text_1_name": "zlm", + "text_2_name": "jpn", + } + yield i, ex + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py + + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__) From d8a04971a3c98470fa766e27803dc44a4bb88935 Mon Sep 17 00:00:00 2001 From: bryanwilie Date: Sun, 4 Feb 2024 01:10:34 +0800 Subject: [PATCH 2/5] cleanup --- .../sea_datasets/malindo_parallel/__init__.py | 0 
.../malindo_parallel/malindo_parallel.py | 174 +++--------------- 2 files changed, 29 insertions(+), 145 deletions(-) create mode 100644 seacrowd/sea_datasets/malindo_parallel/__init__.py diff --git a/seacrowd/sea_datasets/malindo_parallel/__init__.py b/seacrowd/sea_datasets/malindo_parallel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py index 7be7097d9..c5b14f780 100644 --- a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -16,7 +16,6 @@ """ This template serves as a starting point for contributing a dataset to the SEACrowd Datahub repo. -When modifying it for your dataset, look for TODO items that offer specific instructions. Full documentation on writing dataset loading scripts can be found here: https://huggingface.co/docs/datasets/add_dataset.html @@ -26,10 +25,9 @@ * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. -TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. """ -import os import json +import os from pathlib import Path from typing import Dict, List, Tuple @@ -37,10 +35,9 @@ from seacrowd.utils import schemas from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME +from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, + DEFAULT_SOURCE_VIEW_NAME, Tasks) - -# TODO: Add BibTeX citation _CITATION = """\ @misc{MALINDO-parallel, title = "MALINDO-parallel", @@ -49,82 +46,46 @@ } """ -# TODO: create a module level variable with your dataset name (should match script name) -# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer _DATASETNAME = "malindo_parallel" -# TODO: Add description of the dataset here -# You can copy an official description + _DESCRIPTION = """\ Teks ini adalah skrip video untuk Kampus Terbuka Universiti Bahasa Asing Tokyo pada tahun 2020. Tersedia parallel sentences dalam Bahasa Melayu/Indonesia dan Bahasa Jepang """ -# TODO: Add a link to an official homepage for the dataset here (if possible) + _HOMEPAGE = "https://github.com/matbahasa/MALINDO_Parallel/tree/master/OpenCampusTUFS" -# TODO: Add languages related to this dataset + _LANGUAGES = ["zlm", "jpn"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) -# TODO: Add the licence for the dataset here -# Note that this doesn't have to be a common open source license. -# In the case of the dataset intentionally is built without license, please use `Licenses.UNLICENSE.value` -# In the case that it's not clear whether the dataset has a license or not, please use `Licenses.UNKNOWN.value` -# Some datasets may also have custom licenses. In this case, simply put f'{Licenses.OTHERS.value} | {FULL_LICENSE_TERM}' into `_LICENSE` -_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value -# TODO: Add a _LOCAL flag to indicate whether the data cannot be sourced from a public link -# E.g. the dataset requires signing a specific term of use, the dataset is sent through email, etc. 
+_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value + + _LOCAL = False -# TODO: Add links to the urls needed to download your dataset files. -# For local datasets, this variable can be an empty dictionary. -# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. -# In most cases the URLs will be the same for the source and seacrowd config. -# However, if you need to access different files for each config you can have multiple entries in this dict. -# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) _URLS = { _DATASETNAME: "https://github.com/matbahasa/MALINDO_Parallel/blob/master/OpenCampusTUFS/OCTUFS2020.txt", } -# TODO: add supported task by dataset. One dataset may support multiple tasks + _SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] -# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" -# This version doesn't have to be consistent with semantic versioning. Anything that is -# provided by the original dataset as a version goes. + _SOURCE_VERSION = "1.0.0" _SEACROWD_VERSION = "1.0.0" -# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case + class MalindoParallel(datasets.GeneratorBasedBuilder): """Data terjemahan bahasa Melayu/Indonesia""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - # You will be able to load the "source" or "seacrowd" configurations with - # ds_source = datasets.load_dataset('my_dataset', name='source') - # ds_seacrowd = datasets.load_dataset('my_dataset', name='seacrowd') - - # For local datasets you can make use of the `data_dir` and `data_files` kwargs - # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits - # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") - # ds_seacrowd = datasets.load_dataset('my_dataset', name='seacrowd', data_dir="/path/to/data/files") - - # TODO: For each dataset, implement Config for Source and SEACrowd; - # If dataset contains more than one subset (see seacrowd/sea_datasets/smsa.py) implement for EACH of them. - # Each of them should contain: - # - name: should be unique for each dataset config eg. smsa_(source|seacrowd)_[seacrowd_schema_name] - # - version: option = (SOURCE_VERSION|SEACROWD_VERSION) - # - description: one line description for the dataset - # - schema: options = (source|seacrowd_[seacrowd_schema_name]) - # - subset_id: subset id is the canonical name for the dataset (eg. smsa) - # where [seacrowd_schema_name] can be checked in seacrowd/utils/constants.py - # under variable `TASK_TO_SCHEMA`, in accordance to values from `_SUPPORTED_TASKS` - # for all config(s) defined BUILDER_CONFIGS = [ SEACrowdConfig( @@ -147,41 +108,9 @@ class MalindoParallel(datasets.GeneratorBasedBuilder): def _info(self) -> datasets.DatasetInfo: - # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. - - # You can arbitrarily nest lists and dictionaries. 
- # For iterables, use lists over tuples or `datasets.Sequence` - if self.config.schema == "source": - # TODO: Create your source schema here - features = datasets.Features( - { - "id": datasets.Value("string"), - "text": datasets.Value("string") - }) - # raise NotImplementedError() - - # EX: Arbitrary NER type dataset - # features = datasets.Features( - # { - # "doc_id": datasets.Value("string"), - # "text": datasets.Value("string"), - # "entities": [ - # { - # "offsets": [datasets.Value("int64")], - # "text": datasets.Value("string"), - # "type": datasets.Value("string"), - # "entity_id": datasets.Value("string"), - # } - # ], - # } - # ) - - # Choose the appropriate seacrowd schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. - - # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple seacrowd configs with a seacrowd_[seacrowd_schema_name] format. - - # For example seacrowd_kb, seacrowd_t2t + features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string")}) + elif self.config.schema == "seacrowd_t2t": features = schemas.text2text_features @@ -195,35 +124,14 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - - # If you need to access the "source" or "seacrowd" config choice, that will be in self.config.name - - # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath - # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager - - # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. - - # TODO: KEEP if your dataset is PUBLIC; remove if not urls = _URLS[_DATASETNAME] data_dir = dl_manager.download_and_extract(urls) - - # # TODO: KEEP if your dataset is LOCAL; remove if NOT - # if self.config.data_dir is None: - # raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") - # else: - # data_dir = self.config.data_dir - - # Not all datasets have predefined canonical train/val/test splits. - # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ "filepath": data_dir, "split": "train", @@ -231,52 +139,33 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase ), ] - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - - # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. 
- def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: -# """Yields examples as (key, example) tuples.""" -# # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. - -# # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. -# # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files data = json.load(open(filepath, "r")) - # with open(filepath) as f: - # file = f.read() - # data = json.loads(file) - rows = data['payload']['blob']['rawLines'] + + rows = data["payload"]["blob"]["rawLines"] if self.config.schema == "source": - # TODO: yield (key, example) tuples in the original dataset schema - # for key, example in thing: - # yield key, example - + for i, row in enumerate(rows): - t1idx = row.find('\t')+1 - t2idx = row[t1idx:].find('\t') + t1idx = row.find("\t") + 1 + t2idx = row[t1idx:].find("\t") row_id = row[:t1idx] - row_melayu = row[t1idx:t1idx+t2idx] - row_japanese = row[t1idx+t2idx+1:-1] - ex = { - "id": i, - "text": row_melayu+'\t'+row_japanese - } + row_melayu = row[t1idx : t1idx + t2idx] + row_japanese = row[t1idx + t2idx + 1 : -1] + ex = {"id": i, "text": row_melayu + "\t" + row_japanese} yield i, ex elif self.config.schema == "seacrowd_t2t": - # TODO: yield (key, example) tuples in the seacrowd schema - # for key, example in thing: - # yield key, example - + + for i, row in enumerate(rows): - t1idx = row.find('\t')+1 - t2idx = row[t1idx:].find('\t') + t1idx = row.find("\t") + 1 + t2idx = row[t1idx:].find("\t") row_id = row[:t1idx] - row_melayu = row[t1idx:t1idx+t2idx] - row_japanese = row[t1idx+t2idx+1:-1] + row_melayu = row[t1idx : t1idx + t2idx] + row_japanese = row[t1idx + t2idx + 1 : -1] ex = { "id": i, "text_1": row_melayu, @@ -287,11 +176,6 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: yield i, ex -# This template is based on the following template from the datasets package: -# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py - -# This allows you to run your dataloader with `python [dataset_name].py` during development -# TODO: Remove this before making your PR if __name__ == "__main__": datasets.load_dataset(__file__) From 453434c3c8a83d9a027a34e044f24479b65a525d Mon Sep 17 00:00:00 2001 From: Bryan Wilie Date: Sun, 3 Mar 2024 17:29:02 +0800 Subject: [PATCH 3/5] Class name fix Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> --- seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py index c5b14f780..7bb47f527 100644 --- a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -80,7 +80,7 @@ -class MalindoParallel(datasets.GeneratorBasedBuilder): +class MalindoParallelDataset(datasets.GeneratorBasedBuilder): """Data terjemahan bahasa Melayu/Indonesia""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) From fb2c16bdcd5884bc9802060678d7874653907fb9 Mon Sep 17 00:00:00 2001 From: Bryan Wilie Date: Sun, 3 Mar 2024 17:29:38 +0800 Subject: [PATCH 4/5] Remove sample licenses Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> --- seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py index 7bb47f527..ae535592a 100644 --- a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -60,7 +60,7 @@ _LANGUAGES = ["zlm", "jpn"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) -_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value +_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" _LOCAL = False From 355efb1bdcbe6ad2dfad95b85f19f16b77df2eab Mon Sep 17 00:00:00 2001 From: bryanwilie Date: Mon, 1 Apr 2024 15:01:22 +0800 Subject: [PATCH 5/5] fix dataset formatting error, use original dataset id --- .../malindo_parallel/malindo_parallel.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py index ae535592a..e72951010 100644 --- a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -67,7 +67,7 @@ _URLS = { - _DATASETNAME: "https://github.com/matbahasa/MALINDO_Parallel/blob/master/OpenCampusTUFS/OCTUFS2020.txt", + _DATASETNAME: "https://raw.githubusercontent.com/matbahasa/MALINDO_Parallel/master/OpenCampusTUFS/OCTUFS2020.txt", } @@ -141,10 +141,25 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: - - data = json.load(open(filepath, "r")) - - rows = data["payload"]["blob"]["rawLines"] + rows = [] + temp_cols = None + with open(filepath) as file: + while line := file.readline(): + if temp_cols is None: + cols = [] + for col in line.split('\t'): + if len(col.strip('\n'))>0: + cols.append(col) + if len(cols) > 2: + correct_line = line.rstrip() + rows.append(correct_line) + else: + temp_cols = cols + else: + temp_cols.append(line) + correct_line = "\t".join(temp_cols).rstrip() + temp_cols = None + rows.append(correct_line) if self.config.schema == "source": @@ -154,12 +169,12 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: row_id = row[:t1idx] row_melayu = row[t1idx : t1idx + t2idx] row_japanese = row[t1idx + t2idx + 1 : -1] - ex = {"id": i, "text": row_melayu + "\t" + row_japanese} + ex = {"id": row_id.rstrip(), + "text": row_melayu + "\t" + row_japanese} yield i, ex elif self.config.schema == "seacrowd_t2t": - for i, row in enumerate(rows): t1idx = row.find("\t") + 1 t2idx = row[t1idx:].find("\t") @@ -167,7 +182,7 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: row_melayu = row[t1idx : t1idx + t2idx] row_japanese = row[t1idx + t2idx + 1 : -1] ex = { - "id": i, + "id": row_id.rstrip(), "text_1": row_melayu, "text_2": row_japanese, "text_1_name": "zlm",
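
Note on the record re-joining added in PATCH 5: a physical line with two or fewer non-empty tab-separated columns is treated as a wrapped record and merged with the line that follows it. Below is a minimal sketch of that behaviour, assuming hypothetical input records (trailing newlines omitted from the literals for readability).

    # Sketch of the PATCH 5 re-joining logic; the input records are hypothetical.
    lines = [
        "OCTUFS2020-001\tSelamat datang.\tようこそ。",  # complete record: three columns
        "OCTUFS2020-002\tTerima kasih.",                # wrapped record: only two columns ...
        "ありがとうございます。",                        # ... continued on the next physical line
    ]

    rows, temp_cols = [], None
    for line in lines:
        if temp_cols is None:
            cols = [col for col in line.split("\t") if col.strip("\n")]
            if len(cols) > 2:
                rows.append(line.rstrip())   # already a full id/Malay/Japanese record
            else:
                temp_cols = cols             # buffer the partial record
        else:
            temp_cols.append(line)           # attach the continuation line
            rows.append("\t".join(temp_cols).rstrip())
            temp_cols = None

    print(rows)
    # ['OCTUFS2020-001\tSelamat datang.\tようこそ。',
    #  'OCTUFS2020-002\tTerima kasih.\tありがとうございます。']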
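
The per-row slicing shared by the source and seacrowd_t2t branches splits each tab-separated row into an identifier, a Malay/Indonesian segment, and a Japanese segment; the final [:-1] drops the last character of the row (the newline, when the row still carries one). A standalone sketch of that arithmetic follows, using a hypothetical newline-terminated row.

    # Standalone sketch of the slicing used in _generate_examples; the sample row is hypothetical.
    def parse_row(row: str):
        t1idx = row.find("\t") + 1             # index just past the first tab (end of the id field)
        t2idx = row[t1idx:].find("\t")         # length of the Malay/Indonesian segment
        row_id = row[:t1idx].rstrip()          # id, with the trailing tab removed
        row_melayu = row[t1idx : t1idx + t2idx]
        row_japanese = row[t1idx + t2idx + 1 : -1]   # drops the trailing newline
        return row_id, row_melayu, row_japanese

    print(parse_row("OCTUFS2020-001\tSelamat datang.\tようこそ。\n"))
    # ('OCTUFS2020-001', 'Selamat datang.', 'ようこそ。')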
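
For reference, the two configs defined in BUILDER_CONFIGS can be exercised against the local script during development. The snippet below is a usage sketch: the script path is illustrative, and newer datasets releases may additionally require trust_remote_code=True.

    import datasets

    # Usage sketch: "malindo_parallel.py" stands for the local path to this loader script.
    ds_source = datasets.load_dataset("malindo_parallel.py", name="malindo_parallel_source")
    ds_t2t = datasets.load_dataset("malindo_parallel.py", name="malindo_parallel_seacrowd_t2t")

    print(ds_source["train"][0])   # {"id": ..., "text": "<Malay segment>\t<Japanese segment>"}
    print(ds_t2t["train"][0])      # text_1 -> zlm segment, text_2 -> jpn segment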