From ac61b62dc622cda9b5c6c8f3084d32ff1c37a050 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Thu, 9 Nov 2023 10:53:59 +0100 Subject: [PATCH 1/3] add BASE_DATASET_REVISION and BASE_DATASET_REVISIONS to the PIE dataset builder --- src/pie_datasets/core/builder.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/pie_datasets/core/builder.py b/src/pie_datasets/core/builder.py index fc4773fb..adc5df10 100644 --- a/src/pie_datasets/core/builder.py +++ b/src/pie_datasets/core/builder.py @@ -40,6 +40,13 @@ class PieDatasetBuilder(datasets.builder.DatasetBuilder): # base datasets for each config. BASE_DATASET_PATHS: Dict[str, str] = {} + # The default revision (e.g. git commit) of the Hugging Face dataset loading script that will be used + # as the base dataset. + BASE_DATASET_REVISION: Optional[str] = None + # A mapping from config names to revisions (e.g. git commits) of the Hugging Face dataset loading script + # that will be used as the base dataset. Entries take precedence over BASE_DATASET_REVISION. + BASE_DATASET_REVISIONS: Dict[str, str] = {} + # Define kwargs to create base configs. This should contain config names as keys # and the respective config kwargs dicts as values. If the config name is not contained, a new entry # {"name": config_name} will be created for it, i.e. the config name is passed as base config name. 
@@ -85,6 +92,10 @@ def __init__( if self.BASE_BUILDER_KWARGS_DICT is not None: base_builder_kwargs.update(self.BASE_BUILDER_KWARGS_DICT[config_name]) + revision = self.BASE_DATASET_REVISIONS.get(config_name, self.BASE_DATASET_REVISION) + if revision is not None: + base_builder_kwargs["revision"] = revision + base_builder_kwargs.update(base_dataset_kwargs) self.base_builder = datasets.load.load_dataset_builder( path=base_dataset_path, From bf803a3510110b06789c0c01bcec6a629b028c37 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Sat, 11 Nov 2023 15:56:11 +0100 Subject: [PATCH 2/3] add BASE_DATASET_REVISIONs to existing dataset scripts --- dataset_builders/pie/brat/brat.py | 1 + dataset_builders/pie/cdcp/cdcp.py | 1 + dataset_builders/pie/conll2003/conll2003.py | 1 + 3 files changed, 3 insertions(+) diff --git a/dataset_builders/pie/brat/brat.py b/dataset_builders/pie/brat/brat.py index 6f5e39bf..60070384 100644 --- a/dataset_builders/pie/brat/brat.py +++ b/dataset_builders/pie/brat/brat.py @@ -277,6 +277,7 @@ class BratDatasetLoader(GeneratorBasedBuilder): ] BASE_DATASET_PATH = "DFKI-SLT/brat" + BASE_DATASET_REVISION = "70446e79e089d5e5cd5f3426061991a2fcfbf529" def _generate_document(self, example, **kwargs): return example_to_document( diff --git a/dataset_builders/pie/cdcp/cdcp.py b/dataset_builders/pie/cdcp/cdcp.py index 73e9c8bd..755557d2 100644 --- a/dataset_builders/pie/cdcp/cdcp.py +++ b/dataset_builders/pie/cdcp/cdcp.py @@ -125,6 +125,7 @@ class CDCP(GeneratorBasedBuilder): } BASE_DATASET_PATH = "DFKI-SLT/cdcp" + BASE_DATASET_REVISION = "45cf7a6d89866caa8a21c40edf335b88a725ecdb" BUILDER_CONFIGS = [datasets.BuilderConfig(name="default")] diff --git a/dataset_builders/pie/conll2003/conll2003.py b/dataset_builders/pie/conll2003/conll2003.py index 51d35cb7..2ff96063 100644 --- a/dataset_builders/pie/conll2003/conll2003.py +++ b/dataset_builders/pie/conll2003/conll2003.py @@ -18,6 +18,7 @@ class Conll2003(GeneratorBasedBuilder): DOCUMENT_TYPE = CoNLL2003Document 
BASE_DATASET_PATH = "conll2003" + BASE_DATASET_REVISION = "01ad4ad271976c5258b9ed9b910469a806ff3288" BUILDER_CONFIGS = [ datasets.BuilderConfig( From 942944d81e5913cc68aa319fe42e8a62fd5e3da5 Mon Sep 17 00:00:00 2001 From: Arne Binder Date: Mon, 13 Nov 2023 14:50:46 +0100 Subject: [PATCH 3/3] set BASE_DATASET_REVISION for tacred --- dataset_builders/pie/tacred/tacred.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dataset_builders/pie/tacred/tacred.py b/dataset_builders/pie/tacred/tacred.py index cb66046b..e1398e78 100644 --- a/dataset_builders/pie/tacred/tacred.py +++ b/dataset_builders/pie/tacred/tacred.py @@ -166,6 +166,7 @@ class Tacred(GeneratorBasedBuilder): } BASE_DATASET_PATH = "DFKI-SLT/tacred" + BASE_DATASET_REVISION = "c801dc186b40a532c5820b4662570390da90431b" BUILDER_CONFIGS = [ TacredConfig(