-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add all PIE dataset builder scripts from PyTorch-IE
- Loading branch information
1 parent
679306c
commit 845a5fa
Showing
199 changed files
with
633 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from dataclasses import dataclass | ||
|
||
import datasets | ||
|
||
import pytorch_ie.data.builder | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.core import AnnotationList, annotation_field | ||
from pytorch_ie.documents import TextDocument | ||
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans | ||
|
||
|
||
class Conll2002Config(datasets.BuilderConfig):
    """Builder configuration type used for the CoNLL-2002 configurations.

    It adds no options of its own on top of ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: Keyword arguments passed through unchanged to
                ``datasets.BuilderConfig.__init__``.
        """
        super().__init__(**kwargs)
|
||
|
||
@dataclass
class CoNLL2002Document(TextDocument):
    """Text document carrying one annotation layer of labeled entity spans."""

    # Labeled spans; the annotation field targets the document's ``text``.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
|
||
|
||
class Conll2002(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder for CoNLL-2002 (``BASE_DATASET_PATH = "conll2002"``).

    Each base-dataset example is converted into a ``CoNLL2002Document`` whose
    ``entities`` layer holds the labeled spans derived from the token-level
    NER tags.

    The class used to be called ``Conll2003`` (a copy-paste leftover) although
    everything it references is CoNLL-2002; the old name is kept as an alias
    right below the class so existing references keep working.
    """

    DOCUMENT_TYPE = CoNLL2002Document

    BASE_DATASET_PATH = "conll2002"

    BUILDER_CONFIGS = [
        Conll2002Config(
            name="es", version=datasets.Version("1.0.0"), description="CoNLL2002 Spanish dataset"
        ),
        Conll2002Config(
            name="nl", version=datasets.Version("1.0.0"), description="CoNLL2002 Dutch dataset"
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments passed to ``_generate_document``.

        Args:
            dataset: Object exposing ``features``; the inner feature of its
                ``ner_tags`` entry must provide ``int2str`` (presumably a
                ``datasets.ClassLabel`` - confirm against the base dataset).

        Returns:
            A dict with the single key ``int_to_str`` bound to that
            ``int2str`` method.
        """
        return {"int_to_str": dataset.features["ner_tags"].feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Convert one base-dataset example into a ``CoNLL2002Document``.

        Args:
            example: Mapping with ``id``, ``tokens`` and ``ner_tags`` entries.
            int_to_str: Callable applied to every entry of ``ner_tags`` to
                obtain the tag passed on to the span conversion.

        Returns:
            The document, with its entity spans appended in order of
            ascending ``start``.
        """
        doc_id = example["id"]
        tokens = example["tokens"]
        ner_tags = [int_to_str(tag) for tag in example["ner_tags"]]

        text, ner_spans = tokens_and_tags_to_text_and_labeled_spans(tokens=tokens, tags=ner_tags)

        document = CoNLL2002Document(text=text, id=doc_id)

        for span in sorted(ner_spans, key=lambda span: span.start):
            document.entities.append(span)

        return document


# Backward-compatible alias for the previous, misleading class name.
Conll2003 = Conll2002
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from dataclasses import dataclass | ||
|
||
import datasets | ||
|
||
import pytorch_ie.data.builder | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.core import AnnotationList, annotation_field | ||
from pytorch_ie.documents import TextDocument | ||
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans | ||
|
||
|
||
class CoNLLppConfig(datasets.BuilderConfig):
    """Builder configuration type used for the CoNLLpp configuration.

    It adds no options of its own on top of ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: Keyword arguments passed through unchanged to
                ``datasets.BuilderConfig.__init__``.
        """
        super().__init__(**kwargs)
|
||
|
||
@dataclass
class CoNLLppDocument(TextDocument):
    """Text document carrying one annotation layer of labeled entity spans."""

    # Labeled spans; the annotation field targets the document's ``text``.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
|
||
|
||
class CoNLLpp(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder with ``BASE_DATASET_PATH`` set to ``conllpp``.

    Each base-dataset example is converted into a ``CoNLLppDocument`` whose
    ``entities`` layer holds the labeled spans derived from the token-level
    NER tags.
    """

    DOCUMENT_TYPE = CoNLLppDocument

    BASE_DATASET_PATH = "conllpp"

    BUILDER_CONFIGS = [
        CoNLLppConfig(
            name="conllpp",
            version=datasets.Version("1.0.0"),
            description="CoNLLpp dataset",
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments passed to ``_generate_document``.

        Args:
            dataset: Object exposing ``features``; the inner feature of its
                ``ner_tags`` entry must provide ``int2str``.

        Returns:
            A dict with the single key ``int_to_str`` bound to that
            ``int2str`` method.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Convert one base-dataset example into a ``CoNLLppDocument``.

        Args:
            example: Mapping with ``id``, ``tokens`` and ``ner_tags`` entries.
            int_to_str: Callable applied to every entry of ``ner_tags``.

        Returns:
            The document, with its entity spans appended in order of
            ascending ``start``.
        """
        identifier = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        text, spans = tokens_and_tags_to_text_and_labeled_spans(tokens=words, tags=tag_names)

        document = CoNLLppDocument(text=text, id=identifier)
        ordered_spans = sorted(spans, key=lambda span: span.start)
        for entity in ordered_spans:
            document.entities.append(entity)
        return document
Binary file not shown.
Binary file added
BIN
+1.63 KB
dataset_builders/pie/german_legal_entity_recognition/dummy/all/1.0.0/dummy_data.zip
Binary file not shown.
Binary file added
BIN
+1.63 KB
dataset_builders/pie/german_legal_entity_recognition/dummy/bag/1.0.0/dummy_data.zip
Binary file not shown.
Binary file added
BIN
+1.63 KB
dataset_builders/pie/german_legal_entity_recognition/dummy/bfh/1.0.0/dummy_data.zip
Binary file not shown.
Binary file added
BIN
+1.63 KB
dataset_builders/pie/german_legal_entity_recognition/dummy/bgh/1.0.0/dummy_data.zip
Binary file not shown.
Binary file added
BIN
+1.63 KB
dataset_builders/pie/german_legal_entity_recognition/dummy/bpatg/1.0.0/dummy_data.zip
Binary file not shown.
Binary file added
BIN
+1.63 KB
dataset_builders/pie/german_legal_entity_recognition/dummy/bsg/1.0.0/dummy_data.zip
Binary file not shown.
Binary file added
BIN
+1.63 KB
dataset_builders/pie/german_legal_entity_recognition/dummy/bverfg/1.0.0/dummy_data.zip
Binary file not shown.
Binary file added
BIN
+1.63 KB
dataset_builders/pie/german_legal_entity_recognition/dummy/bverwg/1.0.0/dummy_data.zip
Binary file not shown.
65 changes: 65 additions & 0 deletions
65
dataset_builders/pie/german_legal_entity_recognition/german_legal_entity_recognition.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from dataclasses import dataclass | ||
|
||
import datasets | ||
|
||
import pytorch_ie.data.builder | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.core import AnnotationList, annotation_field | ||
from pytorch_ie.documents import TextDocument | ||
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans | ||
|
||
# Version string applied to every builder configuration of this dataset.
_VERSION = "1.0.0"
# Court identifiers.
_COURTS = ["bag", "bfh", "bgh", "bpatg", "bsg", "bverfg", "bverwg"]
# Maps each court identifier to its "<court>.conll" file name.
_COURTS_FILEPATHS = {court: court + ".conll" for court in _COURTS}
# Name of the configuration that covers all courts.
_ALL = "all"
|
||
|
||
class GermanLegalEntityRecognitionConfig(datasets.BuilderConfig):
    """Builder configuration that selects a subset of courts.

    The version is always fixed to ``_VERSION``.
    """

    def __init__(self, *args, courts=None, **kwargs):
        """Create the configuration.

        Args:
            *args: Positional arguments forwarded to
                ``datasets.BuilderConfig.__init__``.
            courts: Iterable of court identifiers (keys of
                ``_COURTS_FILEPATHS``), or ``None`` when no selection is given.
            **kwargs: Keyword arguments forwarded to
                ``datasets.BuilderConfig.__init__``. ``version`` must not be
                among them: it is already passed explicitly below, so a second
                ``version`` raises ``TypeError``.
        """
        super().__init__(*args, version=datasets.Version(_VERSION, ""), **kwargs)
        self.courts = courts

    @property
    def filepaths(self):
        """Return the file names of the selected courts.

        ``courts=None`` (the constructor default) is treated as "all courts";
        iterating ``None`` directly used to raise ``TypeError`` here.

        Raises:
            KeyError: If a selected court is not a key of ``_COURTS_FILEPATHS``.
        """
        selected = _COURTS if self.courts is None else self.courts
        return [_COURTS_FILEPATHS[court] for court in selected]
|
||
|
||
@dataclass
class GermanLegalEntityRecognitionDocument(TextDocument):
    """Text document carrying one annotation layer of labeled entity spans."""

    # Labeled spans; the annotation field targets the document's ``text``.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
|
||
|
||
class GermanLegalEntityRecognition(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder with ``BASE_DATASET_PATH`` set to
    ``german_legal_entity_recognition``.

    There is one configuration per court in ``_COURTS`` plus the ``_ALL``
    configuration covering every court; ``_ALL`` is the default.
    """

    DOCUMENT_TYPE = GermanLegalEntityRecognitionDocument

    BASE_DATASET_PATH = "german_legal_entity_recognition"

    # Per-court configurations first, followed by the all-courts one.
    BUILDER_CONFIGS = [
        *(
            GermanLegalEntityRecognitionConfig(
                name=court, courts=[court], description=f"Court. {court}."
            )
            for court in _COURTS
        ),
        GermanLegalEntityRecognitionConfig(
            name=_ALL, courts=_COURTS, description="All courts included."
        ),
    ]
    BUILDER_CONFIG_CLASS = GermanLegalEntityRecognitionConfig
    DEFAULT_CONFIG_NAME = _ALL  # type: ignore

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments passed to ``_generate_document``.

        Args:
            dataset: Object exposing ``features``; the inner feature of its
                ``ner_tags`` entry must provide ``int2str``.

        Returns:
            A dict with the single key ``int_to_str`` bound to that
            ``int2str`` method.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Convert one base-dataset example into a document.

        Args:
            example: Mapping with ``id``, ``tokens`` and ``ner_tags`` entries.
            int_to_str: Callable applied to every entry of ``ner_tags``.

        Returns:
            A ``GermanLegalEntityRecognitionDocument`` with its entity spans
            appended in order of ascending ``start``.
        """
        identifier = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        text, spans = tokens_and_tags_to_text_and_labeled_spans(tokens=words, tags=tag_names)

        document = GermanLegalEntityRecognitionDocument(text=text, id=identifier)
        ordered_spans = sorted(spans, key=lambda span: span.start)
        for entity in ordered_spans:
            document.entities.append(entity)
        return document
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from dataclasses import dataclass | ||
|
||
import datasets | ||
|
||
import pytorch_ie.data.builder | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.core import AnnotationList, annotation_field | ||
from pytorch_ie.documents import TextDocument | ||
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans | ||
|
||
|
||
class GermaNERConfig(datasets.BuilderConfig):
    """Builder configuration type used for the GermaNER configuration.

    It adds no options of its own on top of ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: Keyword arguments passed through unchanged to
                ``datasets.BuilderConfig.__init__``.
        """
        super().__init__(**kwargs)
|
||
|
||
@dataclass
class GermaNERDocument(TextDocument):
    """Text document carrying one annotation layer of labeled entity spans."""

    # Labeled spans; the annotation field targets the document's ``text``.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
|
||
|
||
class GermaNER(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder with ``BASE_DATASET_PATH`` set to ``germaner``.

    Each base-dataset example is converted into a ``GermaNERDocument`` whose
    ``entities`` layer holds the labeled spans derived from the token-level
    NER tags.
    """

    DOCUMENT_TYPE = GermaNERDocument

    BASE_DATASET_PATH = "germaner"

    BUILDER_CONFIGS = [
        GermaNERConfig(
            name="germaner",
            version=datasets.Version("0.9.1"),
            description="GermaNER dataset",
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments passed to ``_generate_document``.

        Args:
            dataset: Object exposing ``features``; the inner feature of its
                ``ner_tags`` entry must provide ``int2str``.

        Returns:
            A dict with the single key ``int_to_str`` bound to that
            ``int2str`` method.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Convert one base-dataset example into a ``GermaNERDocument``.

        Args:
            example: Mapping with ``id``, ``tokens`` and ``ner_tags`` entries.
            int_to_str: Callable applied to every entry of ``ner_tags``.

        Returns:
            The document, with its entity spans appended in order of
            ascending ``start``.
        """
        identifier = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        text, spans = tokens_and_tags_to_text_and_labeled_spans(tokens=words, tags=tag_names)

        document = GermaNERDocument(text=text, id=identifier)
        ordered_spans = sorted(spans, key=lambda span: span.start)
        for entity in ordered_spans:
            document.entities.append(entity)
        return document
Binary file added
BIN
+12.3 KB
dataset_builders/pie/germeval_14/dummy/germeval_14/2.0.0/dummy_data.zip
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from dataclasses import dataclass | ||
|
||
import datasets | ||
|
||
import pytorch_ie.data.builder | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.core import AnnotationList, annotation_field | ||
from pytorch_ie.documents import TextDocument | ||
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans | ||
|
||
|
||
class GermEval14Config(datasets.BuilderConfig):
    """Builder configuration type used for the GermEval 2014 configuration.

    It adds no options of its own on top of ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: Keyword arguments passed through unchanged to
                ``datasets.BuilderConfig.__init__``.
        """
        super().__init__(**kwargs)
|
||
|
||
@dataclass
class GermEval14Document(TextDocument):
    """Text document carrying one annotation layer of labeled entity spans."""

    # Labeled spans; the annotation field targets the document's ``text``.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
|
||
|
||
class GermEval14(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder with ``BASE_DATASET_PATH`` set to ``germeval_14``.

    Each example carries two tag sequences, ``ner_tags`` and
    ``nested_ner_tags``; the spans derived from both end up together in the
    single ``entities`` layer of a ``GermEval14Document``.
    """

    DOCUMENT_TYPE = GermEval14Document

    BASE_DATASET_PATH = "germeval_14"

    BUILDER_CONFIGS = [
        GermEval14Config(
            name="germeval_14",
            version=datasets.Version("2.0.0"),
            description="GermEval 2014 NER Shared Task dataset",
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments passed to ``_generate_document``.

        Args:
            dataset: Object exposing ``features``; the inner feature of its
                ``ner_tags`` entry must provide ``int2str``.

        Returns:
            A dict with the single key ``int_to_str`` bound to that
            ``int2str`` method.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Convert one base-dataset example into a ``GermEval14Document``.

        Args:
            example: Mapping with ``id``, ``tokens``, ``ner_tags`` and
                ``nested_ner_tags`` entries.
            int_to_str: Callable applied to every entry of both tag
                sequences (the same mapping is used for outer and nested tags).

        Returns:
            The document, with outer and nested spans appended together in
            order of ascending ``start``.
        """
        identifier = example["id"]
        words = example["tokens"]
        outer_tags = list(map(int_to_str, example["ner_tags"]))
        inner_tags = list(map(int_to_str, example["nested_ner_tags"]))

        text, outer_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=outer_tags
        )
        # The text from the second conversion is discarded; only its spans are used.
        _, nested_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=inner_tags
        )

        document = GermEval14Document(text=text, id=identifier)
        ordered_spans = sorted(outer_spans + nested_spans, key=lambda span: span.start)
        for entity in ordered_spans:
            document.entities.append(entity)
        return document
Binary file added
BIN
+577 Bytes
dataset_builders/pie/ncbi_disease/dummy/ncbi_disease/1.0.0/dummy_data.zip
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
from dataclasses import dataclass | ||
|
||
import datasets | ||
|
||
import pytorch_ie.data.builder | ||
from pytorch_ie.annotations import LabeledSpan | ||
from pytorch_ie.core import AnnotationList, annotation_field | ||
from pytorch_ie.documents import TextDocument | ||
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans | ||
|
||
|
||
class NCBIDiseaseConfig(datasets.BuilderConfig):
    """Builder configuration type used for the NCBIDisease configuration.

    It adds no options of its own on top of ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: Keyword arguments passed through unchanged to
                ``datasets.BuilderConfig.__init__``.
        """
        super().__init__(**kwargs)
|
||
|
||
@dataclass
class NCBIDiseaseDocument(TextDocument):
    """Text document carrying one annotation layer of labeled entity spans."""

    # Labeled spans; the annotation field targets the document's ``text``.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")
|
||
|
||
class NCBIDisease(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder with ``BASE_DATASET_PATH`` set to ``ncbi_disease``.

    Each base-dataset example is converted into an ``NCBIDiseaseDocument``
    whose ``entities`` layer holds the labeled spans derived from the
    token-level NER tags.
    """

    DOCUMENT_TYPE = NCBIDiseaseDocument

    BASE_DATASET_PATH = "ncbi_disease"

    BUILDER_CONFIGS = [
        NCBIDiseaseConfig(
            name="ncbi_disease",
            version=datasets.Version("1.0.0"),
            description="NCBIDisease dataset",
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments passed to ``_generate_document``.

        Args:
            dataset: Object exposing ``features``; the inner feature of its
                ``ner_tags`` entry must provide ``int2str``.

        Returns:
            A dict with the single key ``int_to_str`` bound to that
            ``int2str`` method.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Convert one base-dataset example into an ``NCBIDiseaseDocument``.

        Args:
            example: Mapping with ``id``, ``tokens`` and ``ner_tags`` entries.
            int_to_str: Callable applied to every entry of ``ner_tags``.

        Returns:
            The document, with its entity spans appended in order of
            ascending ``start``.
        """
        identifier = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        text, spans = tokens_and_tags_to_text_and_labeled_spans(tokens=words, tags=tag_names)

        document = NCBIDiseaseDocument(text=text, id=identifier)
        ordered_spans = sorted(spans, key=lambda span: span.start)
        for entity in ordered_spans:
            document.entities.append(entity)
        return document
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+870 Bytes
dataset_builders/pie/wikiann/dummy/zh-classical/1.1.0/dummy_data.zip
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Oops, something went wrong.