From e0082f169a54719f65099bdf2f46144e7adff5f6 Mon Sep 17 00:00:00 2001 From: bryanwilie Date: Sun, 28 Jan 2024 00:11:52 +0800 Subject: [PATCH 1/5] add malindo_parallel.py --- .../malindo_parallel/malindo_parallel.py | 297 ++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py new file mode 100644 index 000000000..7be7097d9 --- /dev/null +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -0,0 +1,297 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This template serves as a starting point for contributing a dataset to the SEACrowd Datahub repo. + +When modifying it for your dataset, look for TODO items that offer specific instructions. + +Full documentation on writing dataset loading scripts can be found here: +https://huggingface.co/docs/datasets/add_dataset.html + +To create a dataset loading script you will create a class and implement 3 methods: + * `_info`: Establishes the schema for the dataset, and returns a datasets.DatasetInfo object. + * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. + * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. + +TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. +""" +import os +import json +from pathlib import Path +from typing import Dict, List, Tuple + +import datasets + +from seacrowd.utils import schemas +from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME + + +# TODO: Add BibTeX citation +_CITATION = """\ +@misc{MALINDO-parallel, + title = "MALINDO-parallel", + howpublished = "https://github.com/matbahasa/MALINDO_Parallel/blob/master/README.md", + note = "Accessed: 2023-01-27", +} +""" + +# TODO: create a module level variable with your dataset name (should match script name) +# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer +_DATASETNAME = "malindo_parallel" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +Teks ini adalah skrip video untuk Kampus Terbuka Universiti Bahasa Asing Tokyo pada tahun 2020. 
Tersedia parallel sentences dalam Bahasa Melayu/Indonesia dan Bahasa Jepang +""" + +# TODO: Add a link to an official homepage for the dataset here (if possible) +_HOMEPAGE = "https://github.com/matbahasa/MALINDO_Parallel/tree/master/OpenCampusTUFS" + +# TODO: Add languages related to this dataset +_LANGUAGES = ["zlm", "jpn"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) + +# TODO: Add the licence for the dataset here +# Note that this doesn't have to be a common open source license. +# In the case of the dataset intentionally is built without license, please use `Licenses.UNLICENSE.value` +# In the case that it's not clear whether the dataset has a license or not, please use `Licenses.UNKNOWN.value` +# Some datasets may also have custom licenses. In this case, simply put f'{Licenses.OTHERS.value} | {FULL_LICENSE_TERM}' into `_LICENSE` +_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value + +# TODO: Add a _LOCAL flag to indicate whether the data cannot be sourced from a public link +# E.g. the dataset requires signing a specific term of use, the dataset is sent through email, etc. +_LOCAL = False + +# TODO: Add links to the urls needed to download your dataset files. +# For local datasets, this variable can be an empty dictionary. + +# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. +# In most cases the URLs will be the same for the source and seacrowd config. +# However, if you need to access different files for each config you can have multiple entries in this dict. +# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) +_URLS = { + _DATASETNAME: "https://github.com/matbahasa/MALINDO_Parallel/blob/master/OpenCampusTUFS/OCTUFS2020.txt", +} + +# TODO: add supported task by dataset. One dataset may support multiple tasks +_SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] + +# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" +# This version doesn't have to be consistent with semantic versioning. Anything that is +# provided by the original dataset as a version goes. +_SOURCE_VERSION = "1.0.0" + +_SEACROWD_VERSION = "1.0.0" + + +# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case +class MalindoParallel(datasets.GeneratorBasedBuilder): + """Data terjemahan bahasa Melayu/Indonesia""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) + + # You will be able to load the "source" or "seacrowd" configurations with + # ds_source = datasets.load_dataset('my_dataset', name='source') + # ds_seacrowd = datasets.load_dataset('my_dataset', name='seacrowd') + + # For local datasets you can make use of the `data_dir` and `data_files` kwargs + # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits + # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") + # ds_seacrowd = datasets.load_dataset('my_dataset', name='seacrowd', data_dir="/path/to/data/files") + + # TODO: For each dataset, implement Config for Source and SEACrowd; + # If dataset contains more than one subset (see seacrowd/sea_datasets/smsa.py) implement for EACH of them. 
+ # Each of them should contain: + # - name: should be unique for each dataset config eg. smsa_(source|seacrowd)_[seacrowd_schema_name] + # - version: option = (SOURCE_VERSION|SEACROWD_VERSION) + # - description: one line description for the dataset + # - schema: options = (source|seacrowd_[seacrowd_schema_name]) + # - subset_id: subset id is the canonical name for the dataset (eg. smsa) + # where [seacrowd_schema_name] can be checked in seacrowd/utils/constants.py + # under variable `TASK_TO_SCHEMA`, in accordance to values from `_SUPPORTED_TASKS` + # for all config(s) defined + + BUILDER_CONFIGS = [ + SEACrowdConfig( + name="malindo_parallel_source", + version=SOURCE_VERSION, + description="malindo_parallel source schema", + schema="source", + subset_id="malindo_parallel", + ), + SEACrowdConfig( + name="malindo_parallel_seacrowd_t2t", + version=SEACROWD_VERSION, + description="malindo_parallel SEACrowd schema", + schema="seacrowd_t2t", + subset_id="malindo_parallel", + ), + ] + + DEFAULT_CONFIG_NAME = "malindo_parallel_source" + + def _info(self) -> datasets.DatasetInfo: + + # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. + + # You can arbitrarily nest lists and dictionaries. + # For iterables, use lists over tuples or `datasets.Sequence` + + if self.config.schema == "source": + # TODO: Create your source schema here + features = datasets.Features( + { + "id": datasets.Value("string"), + "text": datasets.Value("string") + }) + # raise NotImplementedError() + + # EX: Arbitrary NER type dataset + # features = datasets.Features( + # { + # "doc_id": datasets.Value("string"), + # "text": datasets.Value("string"), + # "entities": [ + # { + # "offsets": [datasets.Value("int64")], + # "text": datasets.Value("string"), + # "type": datasets.Value("string"), + # "entity_id": datasets.Value("string"), + # } + # ], + # } + # ) + + # Choose the appropriate seacrowd schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. + + # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple seacrowd configs with a seacrowd_[seacrowd_schema_name] format. + + # For example seacrowd_kb, seacrowd_t2t + elif self.config.schema == "seacrowd_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + + # If you need to access the "source" or "seacrowd" config choice, that will be in self.config.name + + # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath + + # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. 
+ + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager + + # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. + + # TODO: KEEP if your dataset is PUBLIC; remove if not + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + # # TODO: KEEP if your dataset is LOCAL; remove if NOT + # if self.config.data_dir is None: + # raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") + # else: + # data_dir = self.config.data_dir + + # Not all datasets have predefined canonical train/val/test splits. + # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": data_dir, + "split": "train", + }, + ), + ] + + # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` + + # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. + + def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: +# """Yields examples as (key, example) tuples.""" +# # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. + +# # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. + +# # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files + + data = json.load(open(filepath, "r")) + # with open(filepath) as f: + # file = f.read() + # data = json.loads(file) + rows = data['payload']['blob']['rawLines'] + + if self.config.schema == "source": + # TODO: yield (key, example) tuples in the original dataset schema + # for key, example in thing: + # yield key, example + + for i, row in enumerate(rows): + t1idx = row.find('\t')+1 + t2idx = row[t1idx:].find('\t') + row_id = row[:t1idx] + row_melayu = row[t1idx:t1idx+t2idx] + row_japanese = row[t1idx+t2idx+1:-1] + ex = { + "id": i, + "text": row_melayu+'\t'+row_japanese + } + yield i, ex + + elif self.config.schema == "seacrowd_t2t": + # TODO: yield (key, example) tuples in the seacrowd schema + # for key, example in thing: + # yield key, example + + for i, row in enumerate(rows): + t1idx = row.find('\t')+1 + t2idx = row[t1idx:].find('\t') + row_id = row[:t1idx] + row_melayu = row[t1idx:t1idx+t2idx] + row_japanese = row[t1idx+t2idx+1:-1] + ex = { + "id": i, + "text_1": row_melayu, + "text_2": row_japanese, + "text_1_name": "zlm", + "text_2_name": "jpn", + } + yield i, ex + + +# This template is based on the following template from the datasets package: +# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py + + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__) From d8a04971a3c98470fa766e27803dc44a4bb88935 Mon Sep 17 00:00:00 2001 From: bryanwilie Date: Sun, 4 Feb 2024 01:10:34 +0800 Subject: [PATCH 2/5] cleanup --- .../sea_datasets/malindo_parallel/__init__.py | 0 
.../malindo_parallel/malindo_parallel.py | 174 +++--------------- 2 files changed, 29 insertions(+), 145 deletions(-) create mode 100644 seacrowd/sea_datasets/malindo_parallel/__init__.py diff --git a/seacrowd/sea_datasets/malindo_parallel/__init__.py b/seacrowd/sea_datasets/malindo_parallel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py index 7be7097d9..c5b14f780 100644 --- a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -16,7 +16,6 @@ """ This template serves as a starting point for contributing a dataset to the SEACrowd Datahub repo. -When modifying it for your dataset, look for TODO items that offer specific instructions. Full documentation on writing dataset loading scripts can be found here: https://huggingface.co/docs/datasets/add_dataset.html @@ -26,10 +25,9 @@ * `_split_generators`: Downloads and extracts data for each split (e.g. train/val/test) or associate local data with each split. * `_generate_examples`: Creates examples from data on disk that conform to each schema defined in `_info`. -TODO: Before submitting your script, delete this doc string and replace it with a description of your dataset. """ -import os import json +import os from pathlib import Path from typing import Dict, List, Tuple @@ -37,10 +35,9 @@ from seacrowd.utils import schemas from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME +from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, + DEFAULT_SOURCE_VIEW_NAME, Tasks) - -# TODO: Add BibTeX citation _CITATION = """\ @misc{MALINDO-parallel, title = "MALINDO-parallel", @@ -49,82 +46,46 @@ } """ -# TODO: create a module level variable with your dataset name (should match script name) -# E.g. Hallmarks of Cancer: [dataset_name] --> hallmarks_of_cancer _DATASETNAME = "malindo_parallel" -# TODO: Add description of the dataset here -# You can copy an official description + _DESCRIPTION = """\ Teks ini adalah skrip video untuk Kampus Terbuka Universiti Bahasa Asing Tokyo pada tahun 2020. Tersedia parallel sentences dalam Bahasa Melayu/Indonesia dan Bahasa Jepang """ -# TODO: Add a link to an official homepage for the dataset here (if possible) + _HOMEPAGE = "https://github.com/matbahasa/MALINDO_Parallel/tree/master/OpenCampusTUFS" -# TODO: Add languages related to this dataset + _LANGUAGES = ["zlm", "jpn"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) -# TODO: Add the licence for the dataset here -# Note that this doesn't have to be a common open source license. -# In the case of the dataset intentionally is built without license, please use `Licenses.UNLICENSE.value` -# In the case that it's not clear whether the dataset has a license or not, please use `Licenses.UNKNOWN.value` -# Some datasets may also have custom licenses. In this case, simply put f'{Licenses.OTHERS.value} | {FULL_LICENSE_TERM}' into `_LICENSE` -_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value -# TODO: Add a _LOCAL flag to indicate whether the data cannot be sourced from a public link -# E.g. the dataset requires signing a specific term of use, the dataset is sent through email, etc. 
+_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value + + _LOCAL = False -# TODO: Add links to the urls needed to download your dataset files. -# For local datasets, this variable can be an empty dictionary. -# For publicly available datasets you will most likely end up passing these URLs to dl_manager in _split_generators. -# In most cases the URLs will be the same for the source and seacrowd config. -# However, if you need to access different files for each config you can have multiple entries in this dict. -# This can be an arbitrarily nested dict/list of URLs (see below in `_split_generators` method) _URLS = { _DATASETNAME: "https://github.com/matbahasa/MALINDO_Parallel/blob/master/OpenCampusTUFS/OCTUFS2020.txt", } -# TODO: add supported task by dataset. One dataset may support multiple tasks + _SUPPORTED_TASKS = [Tasks.MACHINE_TRANSLATION] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION] -# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0" -# This version doesn't have to be consistent with semantic versioning. Anything that is -# provided by the original dataset as a version goes. + _SOURCE_VERSION = "1.0.0" _SEACROWD_VERSION = "1.0.0" -# TODO: Name the dataset class to match the script name using CamelCase instead of snake_case + class MalindoParallel(datasets.GeneratorBasedBuilder): """Data terjemahan bahasa Melayu/Indonesia""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION) - # You will be able to load the "source" or "seacrowd" configurations with - # ds_source = datasets.load_dataset('my_dataset', name='source') - # ds_seacrowd = datasets.load_dataset('my_dataset', name='seacrowd') - - # For local datasets you can make use of the `data_dir` and `data_files` kwargs - # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits - # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files") - # ds_seacrowd = datasets.load_dataset('my_dataset', name='seacrowd', data_dir="/path/to/data/files") - - # TODO: For each dataset, implement Config for Source and SEACrowd; - # If dataset contains more than one subset (see seacrowd/sea_datasets/smsa.py) implement for EACH of them. - # Each of them should contain: - # - name: should be unique for each dataset config eg. smsa_(source|seacrowd)_[seacrowd_schema_name] - # - version: option = (SOURCE_VERSION|SEACROWD_VERSION) - # - description: one line description for the dataset - # - schema: options = (source|seacrowd_[seacrowd_schema_name]) - # - subset_id: subset id is the canonical name for the dataset (eg. smsa) - # where [seacrowd_schema_name] can be checked in seacrowd/utils/constants.py - # under variable `TASK_TO_SCHEMA`, in accordance to values from `_SUPPORTED_TASKS` - # for all config(s) defined BUILDER_CONFIGS = [ SEACrowdConfig( @@ -147,41 +108,9 @@ class MalindoParallel(datasets.GeneratorBasedBuilder): def _info(self) -> datasets.DatasetInfo: - # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible. - - # You can arbitrarily nest lists and dictionaries. 
- # For iterables, use lists over tuples or `datasets.Sequence` - if self.config.schema == "source": - # TODO: Create your source schema here - features = datasets.Features( - { - "id": datasets.Value("string"), - "text": datasets.Value("string") - }) - # raise NotImplementedError() - - # EX: Arbitrary NER type dataset - # features = datasets.Features( - # { - # "doc_id": datasets.Value("string"), - # "text": datasets.Value("string"), - # "entities": [ - # { - # "offsets": [datasets.Value("int64")], - # "text": datasets.Value("string"), - # "type": datasets.Value("string"), - # "entity_id": datasets.Value("string"), - # } - # ], - # } - # ) - - # Choose the appropriate seacrowd schema for your task and copy it here. You can find information on the schemas in the CONTRIBUTING guide. - - # In rare cases you may get a dataset that supports multiple tasks requiring multiple schemas. In that case you can define multiple seacrowd configs with a seacrowd_[seacrowd_schema_name] format. - - # For example seacrowd_kb, seacrowd_t2t + features = datasets.Features({"id": datasets.Value("string"), "text": datasets.Value("string")}) + elif self.config.schema == "seacrowd_t2t": features = schemas.text2text_features @@ -195,35 +124,14 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - - # If you need to access the "source" or "seacrowd" config choice, that will be in self.config.name - - # LOCAL DATASETS: You do not need the dl_manager; you can ignore this argument. Make sure `gen_kwargs` in the return gets passed the right filepath - # PUBLIC DATASETS: Assign your data-dir based on the dl_manager. - - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs; many examples use the download_and_extract method; see the DownloadManager docs here: https://huggingface.co/docs/datasets/package_reference/builder_classes.html#datasets.DownloadManager - - # dl_manager can accept any type of nested list/dict and will give back the same structure with the url replaced with the path to local files. - - # TODO: KEEP if your dataset is PUBLIC; remove if not urls = _URLS[_DATASETNAME] data_dir = dl_manager.download_and_extract(urls) - - # # TODO: KEEP if your dataset is LOCAL; remove if NOT - # if self.config.data_dir is None: - # raise ValueError("This is a local dataset. Please pass the data_dir kwarg to load_dataset.") - # else: - # data_dir = self.config.data_dir - - # Not all datasets have predefined canonical train/val/test splits. - # If your dataset has no predefined splits, use datasets.Split.TRAIN for all of the data. return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ "filepath": data_dir, "split": "train", @@ -231,52 +139,33 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase ), ] - # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - - # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs. 
- def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: -# """Yields examples as (key, example) tuples.""" -# # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. - -# # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. -# # NOTE: For local datasets you will have access to self.config.data_dir and self.config.data_files data = json.load(open(filepath, "r")) - # with open(filepath) as f: - # file = f.read() - # data = json.loads(file) - rows = data['payload']['blob']['rawLines'] + + rows = data["payload"]["blob"]["rawLines"] if self.config.schema == "source": - # TODO: yield (key, example) tuples in the original dataset schema - # for key, example in thing: - # yield key, example - + for i, row in enumerate(rows): - t1idx = row.find('\t')+1 - t2idx = row[t1idx:].find('\t') + t1idx = row.find("\t") + 1 + t2idx = row[t1idx:].find("\t") row_id = row[:t1idx] - row_melayu = row[t1idx:t1idx+t2idx] - row_japanese = row[t1idx+t2idx+1:-1] - ex = { - "id": i, - "text": row_melayu+'\t'+row_japanese - } + row_melayu = row[t1idx : t1idx + t2idx] + row_japanese = row[t1idx + t2idx + 1 : -1] + ex = {"id": i, "text": row_melayu + "\t" + row_japanese} yield i, ex elif self.config.schema == "seacrowd_t2t": - # TODO: yield (key, example) tuples in the seacrowd schema - # for key, example in thing: - # yield key, example - + + for i, row in enumerate(rows): - t1idx = row.find('\t')+1 - t2idx = row[t1idx:].find('\t') + t1idx = row.find("\t") + 1 + t2idx = row[t1idx:].find("\t") row_id = row[:t1idx] - row_melayu = row[t1idx:t1idx+t2idx] - row_japanese = row[t1idx+t2idx+1:-1] + row_melayu = row[t1idx : t1idx + t2idx] + row_japanese = row[t1idx + t2idx + 1 : -1] ex = { "id": i, "text_1": row_melayu, @@ -287,11 +176,6 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: yield i, ex -# This template is based on the following template from the datasets package: -# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py - -# This allows you to run your dataloader with `python [dataset_name].py` during development -# TODO: Remove this before making your PR if __name__ == "__main__": datasets.load_dataset(__file__) From 453434c3c8a83d9a027a34e044f24479b65a525d Mon Sep 17 00:00:00 2001 From: Bryan Wilie Date: Sun, 3 Mar 2024 17:29:02 +0800 Subject: [PATCH 3/5] Class name fix Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> --- seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py index c5b14f780..7bb47f527 100644 --- a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -80,7 +80,7 @@ -class MalindoParallel(datasets.GeneratorBasedBuilder): +class MalindoParallelDataset(datasets.GeneratorBasedBuilder): """Data terjemahan bahasa Melayu/Indonesia""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) From fb2c16bdcd5884bc9802060678d7874653907fb9 Mon Sep 17 00:00:00 2001 From: Bryan Wilie Date: Sun, 3 Mar 2024 17:29:38 +0800 Subject: [PATCH 4/5] Remove sample licenses Co-authored-by: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> --- seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py index 7bb47f527..ae535592a 100644 --- a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -60,7 +60,7 @@ _LANGUAGES = ["zlm", "jpn"] # We follow ISO639-3 language code (https://iso639-3.sil.org/code_tables/639/data) -_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" # example: Licenses.MIT.value, Licenses.CC_BY_NC_SA_4_0.value, Licenses.UNLICENSE.value, Licenses.UNKNOWN.value +_LICENSE = "Creative Commons Attribution 4.0 (cc-by-4.0)" _LOCAL = False From 355efb1bdcbe6ad2dfad95b85f19f16b77df2eab Mon Sep 17 00:00:00 2001 From: bryanwilie Date: Mon, 1 Apr 2024 15:01:22 +0800 Subject: [PATCH 5/5] fix dataset formatting error, use original dataset id --- .../malindo_parallel/malindo_parallel.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py index ae535592a..e72951010 100644 --- a/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py +++ b/seacrowd/sea_datasets/malindo_parallel/malindo_parallel.py @@ -67,7 +67,7 @@ _URLS = { - _DATASETNAME: "https://github.com/matbahasa/MALINDO_Parallel/blob/master/OpenCampusTUFS/OCTUFS2020.txt", + _DATASETNAME: "https://raw.githubusercontent.com/matbahasa/MALINDO_Parallel/master/OpenCampusTUFS/OCTUFS2020.txt", } @@ -141,10 +141,25 @@ def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datase def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: - - data = json.load(open(filepath, "r")) - - rows = data["payload"]["blob"]["rawLines"] + rows = [] + temp_cols = None + with open(filepath) as file: + while line := file.readline(): + if temp_cols is None: + cols = [] + for col in line.split('\t'): + if len(col.strip('\n'))>0: + cols.append(col) + if len(cols) > 2: + correct_line = line.rstrip() + rows.append(correct_line) + else: + temp_cols = cols + else: + temp_cols.append(line) + correct_line = "\t".join(temp_cols).rstrip() + temp_cols = None + rows.append(correct_line) if self.config.schema == "source": @@ -154,12 +169,12 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: row_id = row[:t1idx] row_melayu = row[t1idx : t1idx + t2idx] row_japanese = row[t1idx + t2idx + 1 : -1] - ex = {"id": i, "text": row_melayu + "\t" + row_japanese} + ex = {"id": row_id.rstrip(), + "text": row_melayu + "\t" + row_japanese} yield i, ex elif self.config.schema == "seacrowd_t2t": - for i, row in enumerate(rows): t1idx = row.find("\t") + 1 t2idx = row[t1idx:].find("\t") @@ -167,7 +182,7 @@ def _generate_examples(self, filepath: Path, split: str) -> Tuple[int, Dict]: row_melayu = row[t1idx : t1idx + t2idx] row_japanese = row[t1idx + t2idx + 1 : -1] ex = { - "id": i, + "id": row_id.rstrip(), "text_1": row_melayu, "text_2": row_japanese, "text_1_name": "zlm",
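
Note on the record re-joining added in PATCH 5: a physical line with two or fewer non-empty tab-separated columns is treated as a wrapped record and merged with the line that follows it. Below is a minimal sketch of that behaviour, assuming hypothetical input records (trailing newlines omitted from the literals for readability).

    # Sketch of the PATCH 5 re-joining logic; the input records are hypothetical.
    lines = [
        "OCTUFS2020-001\tSelamat datang.\tようこそ。",  # complete record: three columns
        "OCTUFS2020-002\tTerima kasih.",                # wrapped record: only two columns ...
        "ありがとうございます。",                        # ... continued on the next physical line
    ]

    rows, temp_cols = [], None
    for line in lines:
        if temp_cols is None:
            cols = [col for col in line.split("\t") if col.strip("\n")]
            if len(cols) > 2:
                rows.append(line.rstrip())   # already a full id/Malay/Japanese record
            else:
                temp_cols = cols             # buffer the partial record
        else:
            temp_cols.append(line)           # attach the continuation line
            rows.append("\t".join(temp_cols).rstrip())
            temp_cols = None

    print(rows)
    # ['OCTUFS2020-001\tSelamat datang.\tようこそ。',
    #  'OCTUFS2020-002\tTerima kasih.\tありがとうございます。']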
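
The per-row slicing shared by the source and seacrowd_t2t branches splits each tab-separated row into an identifier, a Malay/Indonesian segment, and a Japanese segment; the final [:-1] drops the last character of the row (the newline, when the row still carries one). A standalone sketch of that arithmetic follows, using a hypothetical newline-terminated row.

    # Standalone sketch of the slicing used in _generate_examples; the sample row is hypothetical.
    def parse_row(row: str):
        t1idx = row.find("\t") + 1             # index just past the first tab (end of the id field)
        t2idx = row[t1idx:].find("\t")         # length of the Malay/Indonesian segment
        row_id = row[:t1idx].rstrip()          # id, with the trailing tab removed
        row_melayu = row[t1idx : t1idx + t2idx]
        row_japanese = row[t1idx + t2idx + 1 : -1]   # drops the trailing newline
        return row_id, row_melayu, row_japanese

    print(parse_row("OCTUFS2020-001\tSelamat datang.\tようこそ。\n"))
    # ('OCTUFS2020-001', 'Selamat datang.', 'ようこそ。')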
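
For reference, the two configs defined in BUILDER_CONFIGS can be exercised against the local script during development. The snippet below is a usage sketch: the script path is illustrative, and newer datasets releases may additionally require trust_remote_code=True.

    import datasets

    # Usage sketch: "malindo_parallel.py" stands for the local path to this loader script.
    ds_source = datasets.load_dataset("malindo_parallel.py", name="malindo_parallel_source")
    ds_t2t = datasets.load_dataset("malindo_parallel.py", name="malindo_parallel_seacrowd_t2t")

    print(ds_source["train"][0])   # {"id": ..., "text": "<Malay segment>\t<Japanese segment>"}
    print(ds_t2t["train"][0])      # text_1 -> zlm segment, text_2 -> jpn segment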