Skip to content

Commit

Permalink
Fix some dataloaders' metadata and fix push_to_hub
Browse files: browse the repository at this point in the history
  • Loading branch information
holylovenia committed Jun 24, 2024
1 parent 7e6f76c commit e2c6207
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 61 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Southeast Asia is home to more than 1,000 native languages. Nevertheless, Southe

### Library Installation

Find seacrowd library (v0.2.0) at https://pypi.org/project/seacrowd/. (See our release notes [here](https://github.com/SEACrowd/seacrowd-datahub/releases/tag/0.2.0).)
Find the seacrowd library (v0.2.2) at https://pypi.org/project/seacrowd/. (See our release notes [here](https://github.com/SEACrowd/seacrowd-datahub/releases/tag/0.2.0).)

To install SEACrowd, install the `seacrowd` package in your Python environment via `pip`.

Expand Down
4 changes: 2 additions & 2 deletions seacrowd/sea_datasets/id_frog_story/id_frog_story.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Tasks
from seacrowd.utils.constants import Tasks, Licenses

_CITATION = """\
@article{FrogStorytelling,
Expand All @@ -31,7 +31,7 @@
"""
_HOMEPAGE = "https://github.com/matbahasa/corpus-frog-storytelling"
_LANGUAGES = ["ind"]
_LICENSE = "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)"
_LICENSE = Licenses.CC_BY_SA_4_0.value
_LOCAL = False
_URLS = {
_DATASETNAME: "https://github.com/matbahasa/corpus-frog-storytelling/archive/refs/heads/master.zip",
Expand Down
4 changes: 2 additions & 2 deletions seacrowd/sea_datasets/jv_id_tts/jv_id_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME,
from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, Licenses,
DEFAULT_SOURCE_VIEW_NAME, Tasks)

_DATASETNAME = "jv_id_tts"
Expand Down Expand Up @@ -38,7 +38,7 @@

_HOMEPAGE = "http://openslr.org/41/"

_LICENSE = "See https://www.openslr.org/resources/41/LICENSE file for license information. Attribution-ShareAlike 4.0 (CC BY-SA 4.0)."
_LICENSE = Licenses.CC_BY_SA_4_0.value

_URLs = {
_DATASETNAME: {
Expand Down
3 changes: 2 additions & 1 deletion seacrowd/sea_datasets/ojw/ojw.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import pandas as pd

from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Licenses

_CITATION = """\
@inproceedings{moeljadi-aminullah-2020-building,
Expand Down Expand Up @@ -61,7 +62,7 @@
_HOMEPAGE = "https://github.com/davidmoeljadi/OJW"


_LICENSE = "Creative Commons Attribution 4.0 International (CC BY 4.0)"
_LICENSE = Licenses.CC_BY_SA_4_0.value


_URLS = {
Expand Down
4 changes: 2 additions & 2 deletions seacrowd/sea_datasets/titml_idn/titml_idn.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME
from seacrowd.utils.constants import Licenses, Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME

_DATASETNAME = "titml_idn"
_SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
Expand All @@ -31,7 +31,7 @@

_HOMEPAGE = "http://research.nii.ac.jp/src/en/TITML-IDN.html"

_LICENSE = "For research purposes only. If you use this corpus, you have to cite (Lestari et al, 2006)."
_LICENSE = Licenses.OTHERS.value + " | For research purposes only. If you use this corpus, you have to cite (Lestari et al, 2006)."

_URLs = {"titml-idn": "https://huggingface.co/datasets/holylovenia/TITML-IDN/resolve/main/IndoLVCSR.zip"}

Expand Down
4 changes: 2 additions & 2 deletions seacrowd/sea_datasets/unimorph_id/unimorph_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from seacrowd.utils import schemas
from seacrowd.utils.configs import SEACrowdConfig
from seacrowd.utils.constants import Tasks
from seacrowd.utils.constants import Tasks, Licenses

_CITATION = """\
@inproceedings{pimentel-ryskina-etal-2021-sigmorphon,
Expand Down Expand Up @@ -105,7 +105,7 @@

_HOMEPAGE = "https://github.com/unimorph/ind"

_LICENSE = "Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)"
_LICENSE = Licenses.CC_BY_SA_3_0.value

_URLS = {
_DATASETNAME: "https://raw.githubusercontent.com/unimorph/ind/main/ind",
Expand Down
120 changes: 69 additions & 51 deletions seacrowd/utils/push_to_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,35 @@ def construct_readme(dsetname):
citation = import_from(module_path, "_CITATION")
license = import_from(module_path, "_LICENSE")

languages_part = "\n- " + "\n- ".join([lang for lang in languages if len(lang) <= 3])
pretty_name_part = dset_name.replace("_", " ").title()
task_categories_part = "\n- " + "\n- ".join(task.name.replace("_", "-").lower() for task in supported_tasks)
readme_string = "\n---"
if "(" in license and ")" in license:
license_part = license[license.find("(")+1:license.find(")")]
readme_string = f'\n---\nlicense: {license_part}\nlanguage: {languages_part}\npretty_name: {pretty_name_part}\ntask_categories: {task_categories_part}\ntags: {task_categories_part}\n---\n'
else:
readme_string = f'\n---\nlanguage: {languages_part}\npretty_name: {pretty_name_part}\ntask_categories: {task_categories_part}\ntags: {task_categories_part}\n---\n'
readme_string += f'\n\n# {pretty_name_part}'
if license_part == "others":
license_part = "other"
readme_string += f'\nlicense: {license_part}'

languages_part = "\n- " + "\n- ".join([lang for lang in languages if len(lang) <= 3])
readme_string += f'\nlanguage: {languages_part}'

pretty_name_part = dset_name.replace("_", " ").title()
readme_string += f'\npretty_name: {pretty_name_part}'

tasks = [task.name.replace("_", "-").lower() for task in supported_tasks]
if len(tasks) > 0:
task_categories_part = "\n- " + "\n- ".join(tasks)
readme_string += f'\ntask_categories: {task_categories_part}'
readme_string += f'\ntags: {task_categories_part}'

readme_string += '\n---'

readme_string += f'\n\n{description}'
if is_local:
readme_string += "\n\nThis is a local dataset. You have to obtain this dataset separately from [{homepage}]({homepage}) to use this dataloader."

readme_string += f'\n\n## Languages\n\n{", ".join(languages)}'

readme_string += f'\n\n## Supported Tasks\n\n{", ".join([str(task.name.replace("_", " ").title()) for task in supported_tasks])}'

readme_string += f'''
\n## Dataset Usage
### Using `datasets` library
Expand All @@ -72,7 +87,7 @@ def construct_readme(dsetname):
readme_string += f'\n\n## Citation\n\nIf you are using the **{dset_name.replace("_", " ").title()}** dataloader in your work, please cite the following:'
readme_string = re.sub(r"( )+\#", "#", readme_string)
readme_string = re.sub(r"( )+\`\`\`", "```", readme_string)
readme_string = re.sub(r"( ){2, 4}", "", readme_string)
readme_string = re.sub(r"[ \t]{2,}", "", readme_string)
readme_string += f'\n```\n{citation}\n{_SEACROWD_CITATION}\n```'
readme_string = re.sub(r"( )+\@", "@", readme_string)
return readme_string
Expand All @@ -85,49 +100,52 @@ def construct_readme(dsetname):

requirements_file = BytesIO(str.encode("seacrowd>=0.2.0"))

# for dirname in ["indolem_sentiment"]:
for i, dirname in enumerate(os.listdir(_SEA_DATASETS_PATH)):
if not os.path.isdir(f"{_SEA_DATASETS_PATH}/{dirname}/"):
print(f"{dirname} is not a directory.")
if not os.path.isdir(f"{_SEA_DATASETS_PATH}/{dirname}/") or dirname == "__pycache__":
print(f"{dirname} is not a dataloader name.")
continue

print(f'({i} / {len(os.listdir(_SEA_DATASETS_PATH))}) {dirname}')

api.create_repo(
f"SEACrowd/{dirname}",
repo_type="dataset",
exist_ok=True)

api.upload_file(
path_or_fileobj=requirements_file,
path_in_repo="requirements.txt",
repo_id=f"SEACrowd/{dirname}",
repo_type="dataset",
)

license_file = BytesIO(str.encode(
import_from(f"seacrowd.sea_datasets.{dirname}.{dirname}", "_LICENSE")))
api.upload_file(
path_or_fileobj=license_file,
path_in_repo="LICENSE",
repo_id=f"SEACrowd/{dirname}",
repo_type="dataset",
)

readme_file = BytesIO(str.encode(construct_readme(dirname)))
api.upload_file(
path_or_fileobj=readme_file,
path_in_repo="README.md",
repo_id=f"SEACrowd/{dirname}",
repo_type="dataset",
)

for dataloader_py_file in os.listdir(f"{_SEA_DATASETS_PATH}/{dirname}"):
if dataloader_py_file.endswith(".py"):
dataloader_file = f"{_SEA_DATASETS_PATH}/{dirname}/{dataloader_py_file}"
api.upload_file(
path_or_fileobj=dataloader_file,
path_in_repo=dataloader_py_file,
repo_id=f"SEACrowd/{dirname}",
repo_type="dataset",
)
try:
print(f'({i} / {len(os.listdir(_SEA_DATASETS_PATH))}) {dirname}')

api.create_repo(
f"SEACrowd/{dirname}",
repo_type="dataset",
exist_ok=True)

api.upload_file(
path_or_fileobj=requirements_file,
path_in_repo="requirements.txt",
repo_id=f"SEACrowd/{dirname}",
repo_type="dataset",
)

license_file = BytesIO(str.encode(
import_from(f"seacrowd.sea_datasets.{dirname}.{dirname}", "_LICENSE")))
api.upload_file(
path_or_fileobj=license_file,
path_in_repo="LICENSE",
repo_id=f"SEACrowd/{dirname}",
repo_type="dataset",
)

readme_file = BytesIO(str.encode(construct_readme(dirname)))
api.upload_file(
path_or_fileobj=readme_file,
path_in_repo="README.md",
repo_id=f"SEACrowd/{dirname}",
repo_type="dataset",
)

for dataloader_py_file in os.listdir(f"{_SEA_DATASETS_PATH}/{dirname}"):
if dataloader_py_file.endswith(".py"):
dataloader_file = f"{_SEA_DATASETS_PATH}/{dirname}/{dataloader_py_file}"
api.upload_file(
path_or_fileobj=dataloader_file,
path_in_repo=dataloader_py_file,
repo_id=f"SEACrowd/{dirname}",
repo_type="dataset",
)
except Exception as e:
print(f"{dirname} ======= Error: {e}")
continue

0 comments on commit e2c6207

Please sign in to comment.