diff --git a/README.md b/README.md index dd5e7724a..36e0f0f34 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Southeast Asia is home to more than 1,000 native languages. Nevertheless, Southe ### Library Installation -Find seacrowd library (v0.2.0) at https://pypi.org/project/seacrowd/. (See our release notes [here](https://github.com/SEACrowd/seacrowd-datahub/releases/tag/0.2.0).) +Find seacrowd library (v0.2.2) at https://pypi.org/project/seacrowd/. (See our release notes [here](https://github.com/SEACrowd/seacrowd-datahub/releases/tag/0.2.2).) To install SEACrowd, install the `seacrowd` package in your python environment via `pip`. diff --git a/seacrowd/sea_datasets/id_frog_story/id_frog_story.py b/seacrowd/sea_datasets/id_frog_story/id_frog_story.py index eea08a1ff..c12fa4c1b 100644 --- a/seacrowd/sea_datasets/id_frog_story/id_frog_story.py +++ b/seacrowd/sea_datasets/id_frog_story/id_frog_story.py @@ -6,7 +6,7 @@ from seacrowd.utils import schemas from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks +from seacrowd.utils.constants import Tasks, Licenses _CITATION = """\ @article{FrogStorytelling, @@ -31,7 +31,7 @@ """ _HOMEPAGE = "https://github.com/matbahasa/corpus-frog-storytelling" _LANGUAGES = ["ind"] -_LICENSE = "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)" +_LICENSE = Licenses.CC_BY_SA_4_0.value _LOCAL = False _URLS = { _DATASETNAME: "https://github.com/matbahasa/corpus-frog-storytelling/archive/refs/heads/master.zip", } diff --git a/seacrowd/sea_datasets/jv_id_tts/jv_id_tts.py b/seacrowd/sea_datasets/jv_id_tts/jv_id_tts.py index abda20e59..ad13c8c33 100644 --- a/seacrowd/sea_datasets/jv_id_tts/jv_id_tts.py +++ b/seacrowd/sea_datasets/jv_id_tts/jv_id_tts.py @@ -7,7 +7,7 @@ from seacrowd.utils import schemas from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, +from seacrowd.utils.constants import 
(DEFAULT_SEACROWD_VIEW_NAME, Licenses, DEFAULT_SOURCE_VIEW_NAME, Tasks) _DATASETNAME = "jv_id_tts" @@ -38,7 +38,7 @@ _HOMEPAGE = "http://openslr.org/41/" -_LICENSE = "See https://www.openslr.org/resources/41/LICENSE file for license information. Attribution-ShareAlike 4.0 (CC BY-SA 4.0)." +_LICENSE = Licenses.CC_BY_SA_4_0.value _URLs = { _DATASETNAME: { diff --git a/seacrowd/sea_datasets/ojw/ojw.py b/seacrowd/sea_datasets/ojw/ojw.py index 30fe9ba38..9bb6507bf 100644 --- a/seacrowd/sea_datasets/ojw/ojw.py +++ b/seacrowd/sea_datasets/ojw/ojw.py @@ -24,6 +24,7 @@ import pandas as pd from seacrowd.utils.configs import SEACrowdConfig +from seacrowd.utils.constants import Licenses _CITATION = """\ @inproceedings{moeljadi-aminullah-2020-building, @@ -61,7 +62,7 @@ _HOMEPAGE = "https://github.com/davidmoeljadi/OJW" -_LICENSE = "Creative Commons Attribution 4.0 International (CC BY 4.0)" +_LICENSE = Licenses.CC_BY_4_0.value _URLS = { diff --git a/seacrowd/sea_datasets/titml_idn/titml_idn.py b/seacrowd/sea_datasets/titml_idn/titml_idn.py index 57ce82a9b..4c1213f38 100644 --- a/seacrowd/sea_datasets/titml_idn/titml_idn.py +++ b/seacrowd/sea_datasets/titml_idn/titml_idn.py @@ -7,7 +7,7 @@ from seacrowd.utils import schemas from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME +from seacrowd.utils.constants import Licenses, Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME _DATASETNAME = "titml_idn" _SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME @@ -31,7 +31,7 @@ _HOMEPAGE = "http://research.nii.ac.jp/src/en/TITML-IDN.html" -_LICENSE = "For research purposes only. If you use this corpus, you have to cite (Lestari et al, 2006)." +_LICENSE = Licenses.OTHERS.value + " | For research purposes only. If you use this corpus, you have to cite (Lestari et al, 2006)." 
_URLs = {"titml-idn": "https://huggingface.co/datasets/holylovenia/TITML-IDN/resolve/main/IndoLVCSR.zip"} diff --git a/seacrowd/sea_datasets/unimorph_id/unimorph_id.py b/seacrowd/sea_datasets/unimorph_id/unimorph_id.py index d8bfa279e..f597dd5be 100644 --- a/seacrowd/sea_datasets/unimorph_id/unimorph_id.py +++ b/seacrowd/sea_datasets/unimorph_id/unimorph_id.py @@ -20,7 +20,7 @@ from seacrowd.utils import schemas from seacrowd.utils.configs import SEACrowdConfig -from seacrowd.utils.constants import Tasks +from seacrowd.utils.constants import Tasks, Licenses _CITATION = """\ @inproceedings{pimentel-ryskina-etal-2021-sigmorphon, @@ -105,7 +105,7 @@ _HOMEPAGE = "https://github.com/unimorph/ind" -_LICENSE = "Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)" +_LICENSE = Licenses.CC_BY_SA_3_0.value _URLS = { _DATASETNAME: "https://raw.githubusercontent.com/unimorph/ind/main/ind", diff --git a/seacrowd/utils/push_to_hub.py b/seacrowd/utils/push_to_hub.py index 7f8f445d3..a0a788115 100644 --- a/seacrowd/utils/push_to_hub.py +++ b/seacrowd/utils/push_to_hub.py @@ -33,20 +33,35 @@ def construct_readme(dsetname): citation = import_from(module_path, "_CITATION") license = import_from(module_path, "_LICENSE") - languages_part = "\n- " + "\n- ".join([lang for lang in languages if len(lang) <= 3]) - pretty_name_part = dset_name.replace("_", " ").title() - task_categories_part = "\n- " + "\n- ".join(task.name.replace("_", "-").lower() for task in supported_tasks) + readme_string = "\n---" if "(" in license and ")" in license: license_part = license[license.find("(")+1:license.find(")")] - readme_string = f'\n---\nlicense: {license_part}\nlanguage: {languages_part}\npretty_name: {pretty_name_part}\ntask_categories: {task_categories_part}\ntags: {task_categories_part}\n---\n' - else: - readme_string = f'\n---\nlanguage: {languages_part}\npretty_name: {pretty_name_part}\ntask_categories: {task_categories_part}\ntags: {task_categories_part}\n---\n' - readme_string 
+= f'\n\n# {pretty_name_part}' + if license_part == "others": + license_part = "other" + readme_string += f'\nlicense: {license_part}' + + languages_part = "\n- " + "\n- ".join([lang for lang in languages if len(lang) <= 3]) + readme_string += f'\nlanguage: {languages_part}' + + pretty_name_part = dset_name.replace("_", " ").title() + readme_string += f'\npretty_name: {pretty_name_part}' + + tasks = [task.name.replace("_", "-").lower() for task in supported_tasks] + if len(tasks) > 0: + task_categories_part = "\n- " + "\n- ".join(tasks) + readme_string += f'\ntask_categories: {task_categories_part}' + readme_string += f'\ntags: {task_categories_part}' + + readme_string += '\n---' + readme_string += f'\n\n{description}' if is_local: readme_string += "\n\nThis is a local dataset. You have to obtain this dataset separately from [{homepage}]({homepage}) to use this dataloader." + readme_string += f'\n\n## Languages\n\n{", ".join(languages)}' + readme_string += f'\n\n## Supported Tasks\n\n{", ".join([str(task.name.replace("_", " ").title()) for task in supported_tasks])}' + readme_string += f''' \n## Dataset Usage ### Using `datasets` library @@ -72,7 +87,7 @@ def construct_readme(dsetname): readme_string += f'\n\n## Citation\n\nIf you are using the **{dset_name.replace("_", " ").title()}** dataloader in your work, please cite the following:' readme_string = re.sub(r"( )+\#", "#", readme_string) readme_string = re.sub(r"( )+\`\`\`", "```", readme_string) - readme_string = re.sub(r"( ){2, 4}", "", readme_string) + readme_string = re.sub(r"[ \t]{2,}", "", readme_string) readme_string += f'\n```\n{citation}\n{_SEACROWD_CITATION}\n```' readme_string = re.sub(r"( )+\@", "@", readme_string) return readme_string @@ -85,49 +100,52 @@ def construct_readme(dsetname): requirements_file = BytesIO(str.encode("seacrowd>=0.2.0")) - # for dirname in ["indolem_sentiment"]: for i, dirname in enumerate(os.listdir(_SEA_DATASETS_PATH)): - if not 
os.path.isdir(f"{_SEA_DATASETS_PATH}/{dirname}/"): - print(f"{dirname} is not a directory.") + if not os.path.isdir(f"{_SEA_DATASETS_PATH}/{dirname}/") or dirname == "__pycache__": + print(f"{dirname} is not a dataloader name.") continue - print(f'({i} / {len(os.listdir(_SEA_DATASETS_PATH))}) {dirname}') - - api.create_repo( - f"SEACrowd/{dirname}", - repo_type="dataset", - exist_ok=True) - - api.upload_file( - path_or_fileobj=requirements_file, - path_in_repo="requirements.txt", - repo_id=f"SEACrowd/{dirname}", - repo_type="dataset", - ) - - license_file = BytesIO(str.encode( - import_from(f"seacrowd.sea_datasets.{dirname}.{dirname}", "_LICENSE"))) - api.upload_file( - path_or_fileobj=license_file, - path_in_repo="LICENSE", - repo_id=f"SEACrowd/{dirname}", - repo_type="dataset", - ) - - readme_file = BytesIO(str.encode(construct_readme(dirname))) - api.upload_file( - path_or_fileobj=readme_file, - path_in_repo="README.md", - repo_id=f"SEACrowd/{dirname}", - repo_type="dataset", - ) - - for dataloader_py_file in os.listdir(f"{_SEA_DATASETS_PATH}/{dirname}"): - if dataloader_py_file.endswith(".py"): - dataloader_file = f"{_SEA_DATASETS_PATH}/{dirname}/{dataloader_py_file}" - api.upload_file( - path_or_fileobj=dataloader_file, - path_in_repo=dataloader_py_file, - repo_id=f"SEACrowd/{dirname}", - repo_type="dataset", - ) \ No newline at end of file + try: + print(f'({i} / {len(os.listdir(_SEA_DATASETS_PATH))}) {dirname}') + + api.create_repo( + f"SEACrowd/{dirname}", + repo_type="dataset", + exist_ok=True) + + api.upload_file( + path_or_fileobj=requirements_file, + path_in_repo="requirements.txt", + repo_id=f"SEACrowd/{dirname}", + repo_type="dataset", + ) + + license_file = BytesIO(str.encode( + import_from(f"seacrowd.sea_datasets.{dirname}.{dirname}", "_LICENSE"))) + api.upload_file( + path_or_fileobj=license_file, + path_in_repo="LICENSE", + repo_id=f"SEACrowd/{dirname}", + repo_type="dataset", + ) + + readme_file = BytesIO(str.encode(construct_readme(dirname))) + 
api.upload_file( + path_or_fileobj=readme_file, + path_in_repo="README.md", + repo_id=f"SEACrowd/{dirname}", + repo_type="dataset", + ) + + for dataloader_py_file in os.listdir(f"{_SEA_DATASETS_PATH}/{dirname}"): + if dataloader_py_file.endswith(".py"): + dataloader_file = f"{_SEA_DATASETS_PATH}/{dirname}/{dataloader_py_file}" + api.upload_file( + path_or_fileobj=dataloader_file, + path_in_repo=dataloader_py_file, + repo_id=f"SEACrowd/{dirname}", + repo_type="dataset", + ) + except Exception as e: + print(f"{dirname} ======= Error: {e}") + continue \ No newline at end of file