SEACrowd · patrickamadeus · Apr 9, 2024 · May 2, 2024 · May 13, 2024 · May 13, 2024
@@ -0,0 +1,177 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Onto4All is a subsample of other open source performant conversational datasets. We start with a carefully curated subset of the OpenHermes-2.5-Viet dataset, co-created by @qnguyen3 and @Teknium. This dataset is specifically designed to support the training and evaluation of Multilingual language models, such as Vistral-7B-chat and VinaLlama-7B-chat, and is derived from our Supervised Fine-Tuning (SFT) data. We have included Vietnamese here, but will add more languages.
+"""
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+import pandas as pd
+
+from seacrowd.utils import schemas
+from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Tasks, Licenses
+
+_CITATION = """\
+@article{Onto4All2024,
+  title={Onto4All: Enhancing Multilingual Conversational AI},
+  author={Nguyen, Q., },
+  journal={GitHub repository},
+  year={2024},
+  publisher={HuggingFace Datasets}
+}
+"""
+
+_DATASETNAME = "onto4all"
+
+_DESCRIPTION = """\
+Onto4All is a subsample of other open source performant conversational datasets. We start with a carefully curated subset of the OpenHermes-2.5-Viet dataset, co-created by @qnguyen3 and @Teknium. This dataset is specifically designed to support the training and evaluation of Multilingual language models, such as Vistral-7B-chat and VinaLlama-7B-chat, and is derived from our Supervised Fine-Tuning (SFT) data. We have included Vietnamese here, but will add more languages.
+"""
+
+# Dataset card on the Hugging Face Hub; reported as the homepage in `_info`.
+_HOMEPAGE = "https://huggingface.co/datasets/ontocord/onto4all"
+
+# Language codes covered by the dataset ("vie": the description states that Vietnamese is included).
+_LANGUAGES = ["vie"]
+
+# License string reported in `_info`.
+_LICENSE = Licenses.CC0_1_0.value
+
+# NOTE(review): presumably flags that the data is downloaded rather than supplied
+# from a local path - confirm against the SEACrowd loader conventions.
+_LOCAL = False
+
+# Single parquet file holding the whole train split; passed to
+# `dl_manager.download_and_extract` in `_split_generators`.
+_URLS = "https://huggingface.co/datasets/ontocord/onto4all/resolve/main/data/train-00000-of-00001.parquet?download=true"
+
+# Task tag(s) this loader supports.
+_SUPPORTED_TASKS = [Tasks.MULTI_TURN_CONVERSATION]
+
+# Version string wrapped in `datasets.Version` for the source config.
+_SOURCE_VERSION = "1.0.0"
+
+# Version string wrapped in `datasets.Version` for the SEACrowd config.
+_SEACROWD_VERSION = "1.0.0"
+
+class Onto4AllDataset(datasets.GeneratorBasedBuilder):
+    """SEACrowd dataloader for Onto4All, a Vietnamese conversational SFT dataset.
+
+    Onto4All is a subsample of other open source performant conversational datasets,
+    starting from a carefully curated subset of the OpenHermes-2.5-Viet dataset.
+
+    Two configurations are exposed:
+
+    * ``onto4all_source``: rows as read from the parquet file, i.e. ``id``, ``type``
+      and the ``conversation`` messages with their ``from``/``value``/``weight`` fields.
+    * ``onto4all_seacrowd_chat``: rows mapped onto the SEACrowd chat schema. Every
+      conversation turn is kept in order in ``input``; the trailing assistant reply,
+      when there is one, becomes ``output``. The source ``type`` goes to ``meta``.
+    """
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)
+
+    # Speaker tags used by the source data -> role names used by the chat schema.
+    _ROLE_BY_SENDER = {"system": "system", "human": "user", "gpt": "assistant"}
+
+    BUILDER_CONFIGS = [
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_source",
+            version=SOURCE_VERSION,
+            description=f"{_DATASETNAME} source schema",
+            schema="source",
+            subset_id=f"{_DATASETNAME}",
+        ),
+        SEACrowdConfig(
+            name=f"{_DATASETNAME}_seacrowd_chat",
+            version=SEACROWD_VERSION,
+            description=f"{_DATASETNAME} SEACrowd schema",
+            schema="seacrowd_chat",
+            subset_id=f"{_DATASETNAME}",
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+        """Return the dataset metadata and the features of the selected schema.
+
+        Raises:
+            ValueError: If the configured schema is neither ``source`` nor
+                ``seacrowd_chat``.
+        """
+        if self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("int32"),
+                    "type": datasets.Value("string"),
+                    "conversation": datasets.Sequence(
+                        {
+                            "from": datasets.Value("string"),
+                            "value": datasets.Value("string"),
+                            "weight": datasets.Value("int32"),
+                        }
+                    ),
+                }
+            )
+        elif self.config.schema == "seacrowd_chat":
+            # Build a new Features object instead of assigning into
+            # `schemas.chat_features`: that object is module-level state shared by
+            # every loader importing it, so writing `meta` into it in place would
+            # leak this dataset's `meta` layout into the other loaders.
+            features = datasets.Features(
+                {
+                    **schemas.chat_features,
+                    "meta": {"type": datasets.Value("string")},
+                }
+            )
+        else:
+            # Fail with a clear message instead of an UnboundLocalError below.
+            raise ValueError(f"Unsupported schema: {self.config.schema}")
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
+        """Download the parquet file and expose it as the single train split."""
+        data_dir = dl_manager.download_and_extract(_URLS)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": data_dir,
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
+        """Yield ``(key, example)`` pairs for the configured schema.
+
+        Args:
+            filepath: Location of the downloaded parquet file.
+
+        Yields:
+            The dataframe index as key, together with the example dict laid out
+            according to the ``source`` or ``seacrowd_chat`` schema.
+        """
+        df = pd.read_parquet(filepath)
+
+        if self.config.schema == "source":
+            for i, row in df.iterrows():
+                conversation = [
+                    {
+                        "from": item["from"],
+                        "value": item["value"],
+                        "weight": item["weight"],
+                    }
+                    for item in row["conversation"]
+                ]
+
+                yield i, {
+                    "id": row["id"],
+                    "type": row["type"],
+                    "conversation": conversation,
+                }
+
+        elif self.config.schema == "seacrowd_chat":
+            role_by_sender = self._ROLE_BY_SENDER
+
+            for i, row in df.iterrows():
+                # Keep every turn in its original order. Collapsing the conversation
+                # into one system/user/assistant triple would drop the earlier turns
+                # of a multi-turn dialogue. Messages from speakers outside the role
+                # map are skipped because the schema has no label for them.
+                turns = [
+                    {
+                        "role": role_by_sender[item["from"]],
+                        "content": item["value"],
+                    }
+                    for item in row["conversation"]
+                    if item["from"] in role_by_sender
+                ]
+
+                # The closing assistant reply is the target; all preceding turns
+                # form the input. Without a closing reply the output stays empty.
+                output = ""
+                if turns and turns[-1]["role"] == "assistant":
+                    output = turns.pop()["content"]
+
+                yield i, {
+                    # The chat schema declares `id` as a string.
+                    "id": str(row["id"]),
+                    "input": turns,
+                    "output": output,
+                    "meta": {
+                        "type": row["type"],
+                    },
+                }
@@ -10,6 +10,7 @@
     pairs_features_score,
     pairs_multi_features,
     qa_features,
+    chat_features,
     image_features,
     image_multi_features,
     imqa_features,
@@ -105,6 +106,7 @@ class Tasks(Enum):
     # Multi Text Generation
     DIALOGUE_SYSTEM = "DS"
     E2E_TASK_ORIENTED_DIALOGUE = "TOD"
+    MULTI_TURN_CONVERSATION = "MTC"
 
     # Self Supervised & Unsupervised Text
     PROMPTING = "PRT"
@@ -246,6 +248,7 @@ class Licenses(Enum):
     Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL",
     Tasks.COMMONSENSE_REASONING: "QA",
     Tasks.QUESTION_ANSWERING: "QA",
+    Tasks.MULTI_TURN_CONVERSATION: "CHAT",
     Tasks.CONCEPT_ALIGNMENT_CLASSIFICATION: "PAIRS",
     Tasks.NEXT_SENTENCE_PREDICTION: "PAIRS",
     Tasks.TEXT_RETRIEVAL: "PAIRS",
@@ -313,6 +316,7 @@ class Licenses(Enum):
     "KB": kb_features,
     "TREE": tree_features,
     "QA": qa_features,
+    "CHAT": chat_features,
     "T2T": text2text_features,
     "TEXT": text_features(),
     "TEXT_MULTI": text_multi_features(),

@@ -5,6 +5,7 @@
 from .pairs import features_with_continuous_label as pairs_features_score
 from .pairs_multilabel import features as pairs_multi_features
 from .qa import features as qa_features
+from .chat import features as chat_features
 from .image import features as image_features
 from .image import multi_features as image_multi_features
 from .imqa import features as imqa_features
@@ -28,6 +29,7 @@
     "pairs_features_score",
     "pairs_multi_features",
     "qa_features",
+    "chat_features",
     "image_features",
     "image_multi_features",
     "imqa_features",

@@ -0,0 +1,24 @@
+"""
+Conversational Chat Schema
+"""
+import datasets
+
+# Feature layout of the conversational chat schema.
+features = datasets.Features(
+    {
+        "id": datasets.Value("string"),
+        # Sequence of turns; each turn has a role label and its text content.
+        "input": datasets.Sequence({
+            "role": datasets.ClassLabel(names=["system", "user", "assistant"]),
+            "content": datasets.Value("string"),
+        }),
+        "output": datasets.Value("string"),
+
+        # The schema of 'meta' is deliberately left unspecified to allow some flexibility.
+        "meta": {}
+
+        # Notes on how to use the 'meta' field. Choose one of two options:
+        # 1. leave it as an empty dict if it is not needed in `_generate_examples`, or
+        # 2. in the dataloader's `_info` method, define 'meta' as a dict that maps each intended
+        #    meta column name to a `datasets` feature type, then populate those keys with values
+        #    in the dataloader's `_generate_examples` method.
+    }
+)