diff --git a/seacrowd/sea_datasets/onto4all/__init__.py b/seacrowd/sea_datasets/onto4all/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/seacrowd/sea_datasets/onto4all/onto4all.py b/seacrowd/sea_datasets/onto4all/onto4all.py new file mode 100644 index 000000000..c08c73d18 --- /dev/null +++ b/seacrowd/sea_datasets/onto4all/onto4all.py @@ -0,0 +1,177 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Onto4All is a subsample of other open source performant conversational datasets. We start with a carefully curated subset of the OpenHermes-2.5-Viet dataset, co-created by @qnguyen3 and @Teknium. This dataset is specifically designed to support the training and evaluation of Multilingual language models, such as Vistral-7B-chat and VinaLlama-7B-chat, and is derived from our Supervised Fine-Tuning (SFT) data. We have included Vietnamese here, but will add more languages. 
class Onto4AllDataset(datasets.GeneratorBasedBuilder):
    """Dataloader for Onto4All, a Vietnamese conversational SFT dataset.

    Onto4All is a subsample of other open source performant conversational
    datasets, starting with a curated subset of OpenHermes-2.5-Viet.

    Two configurations are exposed:
      * ``onto4all_source``: rows as stored in the upstream parquet file.
      * ``onto4all_seacrowd_chat``: rows mapped onto the SEACrowd chat schema.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    SEACROWD_VERSION = datasets.Version(_SEACROWD_VERSION)

    BUILDER_CONFIGS = [
        SEACrowdConfig(
            name=f"{_DATASETNAME}_source",
            version=SOURCE_VERSION,
            description=f"{_DATASETNAME} source schema",
            schema="source",
            subset_id=f"{_DATASETNAME}",
        ),
        SEACrowdConfig(
            name=f"{_DATASETNAME}_seacrowd_chat",
            version=SEACROWD_VERSION,
            description=f"{_DATASETNAME} SEACrowd schema",
            schema="seacrowd_chat",
            subset_id=f"{_DATASETNAME}",
        ),
    ]

    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"

    # Speaker tags found in the source data -> role names of the chat schema.
    _ROLE_BY_SPEAKER = {"system": "system", "human": "user", "gpt": "assistant"}

    def _info(self) -> datasets.DatasetInfo:
        """Describe the features of the selected configuration.

        Raises:
            ValueError: if the configured schema is neither ``source`` nor
                ``seacrowd_chat``.
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "id": datasets.Value("int32"),
                    "type": datasets.Value("string"),
                    "conversation": datasets.Sequence(
                        {
                            "from": datasets.Value("string"),
                            "value": datasets.Value("string"),
                            "weight": datasets.Value("int32"),
                        }
                    ),
                }
            )
        elif self.config.schema == "seacrowd_chat":
            # Build a new Features object instead of assigning into
            # `schemas.chat_features`: that object is shared module state and
            # writing to it would change the schema seen by every other
            # dataloader in the same process.
            features = datasets.Features(
                {
                    **schemas.chat_features,
                    "meta": {"type": datasets.Value("string")},
                }
            )
        else:
            raise ValueError(f"Unsupported schema: {self.config.schema!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download the parquet file and expose it as the single train split."""
        parquet_path = dl_manager.download_and_extract(_URLS)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": parquet_path,
                },
            ),
        ]

    @classmethod
    def _split_turns(cls, conversation):
        """Split a source conversation into (input turns, final answer).

        Every turn with a known speaker tag is kept, in order, with its tag
        translated to a chat-schema role. If the last kept turn is from the
        assistant it becomes the output; otherwise the output is empty and all
        turns stay in the input. Turns with an unknown speaker tag are ignored.
        """
        turns = [
            {"role": cls._ROLE_BY_SPEAKER[item["from"]], "content": item["value"]}
            for item in conversation
            if item["from"] in cls._ROLE_BY_SPEAKER
        ]

        answer = ""
        if turns and turns[-1]["role"] == "assistant":
            answer = turns.pop()["content"]
        return turns, answer

    def _generate_examples(self, filepath: Path) -> Tuple[int, Dict]:
        """Yield examples as (key, example) tuples.

        Args:
            filepath: location of the downloaded parquet file.

        Raises:
            ValueError: if the configured schema is neither ``source`` nor
                ``seacrowd_chat``.
        """
        df = pd.read_parquet(filepath)

        if self.config.schema == "source":
            for i, row in df.iterrows():
                conversation = [
                    {
                        "from": item["from"],
                        "value": item["value"],
                        "weight": item["weight"],
                    }
                    for item in row["conversation"]
                ]

                yield i, {
                    "id": row["id"],
                    "type": row["type"],
                    "conversation": conversation,
                }

        elif self.config.schema == "seacrowd_chat":
            for i, row in df.iterrows():
                # Keep the full dialogue history: overwriting a single
                # question/answer pair per row would drop every turn but the
                # last one of a multi-turn conversation.
                turns, answer = self._split_turns(row["conversation"])

                yield i, {
                    # The chat schema declares `id` as a string.
                    "id": str(row["id"]),
                    "input": turns,
                    "output": answer,
                    "meta": {
                        "type": row["type"],
                    },
                }

        else:
            raise ValueError(f"Unsupported schema: {self.config.schema!r}")
# Conversational chat schema.
#
# `input` holds the dialogue turns given to the model and `output` holds the
# expected response.
#
# `meta` is deliberately left without a fixed schema to allow some
# flexibility. A dataloader can either:
#   1. leave it as an empty dict if it has nothing useful to store there, or
#   2. define it in its `_info` method as a dict mapping each intended meta
#      column name to a `datasets` feature type, and then populate it with the
#      matching values in its `_generate_examples` method.
features = datasets.Features(
    id=datasets.Value("string"),
    input=datasets.Sequence(
        feature={
            "role": datasets.ClassLabel(names=["system", "user", "assistant"]),
            "content": datasets.Value("string"),
        }
    ),
    output=datasets.Value("string"),
    meta={},
)