Skip to content

Commit

Permalink
Merge pull request #44 from ArneBinder/add_base_dataset_revision_to_b…
Browse files Browse the repository at this point in the history
…uilder

add `BASE_DATASET_REVISION` and `BASE_DATASET_REVISIONS` to the dataset builder
  • Loading branch information
ArneBinder authored Nov 13, 2023
2 parents 137c9a8 + 942944d commit 5bcdfc8
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 0 deletions.
1 change: 1 addition & 0 deletions dataset_builders/pie/brat/brat.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ class BratDatasetLoader(GeneratorBasedBuilder):
]

BASE_DATASET_PATH = "DFKI-SLT/brat"
BASE_DATASET_REVISION = "70446e79e089d5e5cd5f3426061991a2fcfbf529"

def _generate_document(self, example, **kwargs):
return example_to_document(
Expand Down
1 change: 1 addition & 0 deletions dataset_builders/pie/cdcp/cdcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ class CDCP(GeneratorBasedBuilder):
}

BASE_DATASET_PATH = "DFKI-SLT/cdcp"
BASE_DATASET_REVISION = "45cf7a6d89866caa8a21c40edf335b88a725ecdb"

BUILDER_CONFIGS = [datasets.BuilderConfig(name="default")]

Expand Down
1 change: 1 addition & 0 deletions dataset_builders/pie/conll2003/conll2003.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class Conll2003(GeneratorBasedBuilder):
DOCUMENT_TYPE = CoNLL2003Document

BASE_DATASET_PATH = "conll2003"
BASE_DATASET_REVISION = "01ad4ad271976c5258b9ed9b910469a806ff3288"

BUILDER_CONFIGS = [
datasets.BuilderConfig(
Expand Down
1 change: 1 addition & 0 deletions dataset_builders/pie/tacred/tacred.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ class Tacred(GeneratorBasedBuilder):
}

BASE_DATASET_PATH = "DFKI-SLT/tacred"
BASE_DATASET_REVISION = "c801dc186b40a532c5820b4662570390da90431b"

BUILDER_CONFIGS = [
TacredConfig(
Expand Down
11 changes: 11 additions & 0 deletions src/pie_datasets/core/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ class PieDatasetBuilder(datasets.builder.DatasetBuilder):
# base datasets for each config.
BASE_DATASET_PATHS: Dict[str, str] = {}

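# The default revision (e.g. a git commit hash) of the Hugging Face dataset loading script that is used
# as the base dataset. It applies to every config that has no entry in BASE_DATASET_REVISIONS. If no
# revision is resolved (None), no "revision" entry is added to the base builder kwargs.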
BASE_DATASET_REVISION: Optional[str] = None
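# A mapping from config names to revisions (e.g. git commit hashes) of the Hugging Face dataset loading
# script that is used as the base dataset. An entry for a config name takes precedence over
# BASE_DATASET_REVISION, which is only used as the fallback for configs not listed here.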
BASE_DATASET_REVISIONS: Dict[str, str] = {}

# Define kwargs to create base configs. This should contain config names as keys
# and the respective config kwargs dicts as values. If the config name is not contained, a new entry
# {"name": config_name} will be created for it, i.e. the config name is passed as base config name.
Expand Down Expand Up @@ -85,6 +92,10 @@ def __init__(
if self.BASE_BUILDER_KWARGS_DICT is not None:
base_builder_kwargs.update(self.BASE_BUILDER_KWARGS_DICT[config_name])

revision = self.BASE_DATASET_REVISIONS.get(config_name, self.BASE_DATASET_REVISION)
if revision is not None:
base_builder_kwargs["revision"] = revision

base_builder_kwargs.update(base_dataset_kwargs)
self.base_builder = datasets.load.load_dataset_builder(
path=base_dataset_path,
Expand Down

0 comments on commit 5bcdfc8

Please sign in to comment.