Skip to content

Commit

Permalink
add all PIE dataset builder scripts from PyTorch-IE
Browse files Browse the repository at this point in the history
  • Loading branch information
ArneBinder committed Nov 7, 2023
1 parent 679306c commit 845a5fa
Show file tree
Hide file tree
Showing 199 changed files with 633 additions and 0 deletions.
57 changes: 57 additions & 0 deletions dataset_builders/pie/conll2002/conll2002.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from dataclasses import dataclass

import datasets

import pytorch_ie.data.builder
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextDocument
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans


class Conll2002Config(datasets.BuilderConfig):
    """Builder configuration used by the CoNLL2002 dataset builder.

    Adds nothing on top of ``datasets.BuilderConfig``; it only provides a
    dedicated config type for this dataset.
    """

    def __init__(self, **kwargs):
        """Create the config.

        Only keyword arguments are accepted; all of them are passed on
        unchanged to the ``datasets.BuilderConfig`` initializer.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


@dataclass
class CoNLL2002Document(TextDocument):
    """Text document carrying one annotation layer of labeled spans."""

    # Labeled spans of this document; the layer is declared with the
    # ``text`` field as its target.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")


class Conll2002(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder that converts CoNLL2002 examples into documents.

    Each example of the base dataset becomes one ``CoNLL2002Document`` whose
    ``entities`` layer holds the labeled spans derived from the example's
    tokens and NER tags.
    """

    # Document class produced by this builder.
    DOCUMENT_TYPE = CoNLL2002Document

    # Identifier of the underlying dataset this builder wraps
    # (presumably a Hugging Face dataset name; verify against the base class).
    BASE_DATASET_PATH = "conll2002"

    # One config per language variant of the dataset.
    BUILDER_CONFIGS = [
        Conll2002Config(
            name="es",
            version=datasets.Version("1.0.0"),
            description="CoNLL2002 Spanish dataset",
        ),
        Conll2002Config(
            name="nl",
            version=datasets.Version("1.0.0"),
            description="CoNLL2002 Dutch dataset",
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments for ``_generate_document``.

        Args:
            dataset: base dataset whose ``features["ner_tags"].feature``
                provides an ``int2str`` callable.

        Returns:
            A dict with the single key ``"int_to_str"`` mapped to that
            callable.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Build a ``CoNLL2002Document`` from one example.

        Args:
            example: mapping with the keys ``"id"``, ``"tokens"`` and
                ``"ner_tags"``.
            int_to_str: callable applied to every entry of
                ``example["ner_tags"]`` to obtain the tag passed to the
                span helper.

        Returns:
            The document with its ``entities`` filled in order of span start.
        """
        example_id = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        text, labeled_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=tag_names
        )
        document = CoNLL2002Document(text=text, id=example_id)

        # Spans are added one by one, ordered by their start offset.
        ordered_spans = sorted(labeled_spans, key=lambda labeled_span: labeled_span.start)
        for labeled_span in ordered_spans:
            document.entities.append(labeled_span)

        return document


# Backward-compatible alias: the builder was originally (mis)named after
# CoNLL2003 although it wraps the CoNLL2002 dataset. Keep the old name
# importable so existing references do not break.
Conll2003 = Conll2002
Binary file not shown.
Binary file not shown.
54 changes: 54 additions & 0 deletions dataset_builders/pie/conllpp/conllpp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from dataclasses import dataclass

import datasets

import pytorch_ie.data.builder
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextDocument
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans


class CoNLLppConfig(datasets.BuilderConfig):
    """Builder configuration used by the CoNLLpp dataset builder.

    Adds nothing on top of ``datasets.BuilderConfig``; it only provides a
    dedicated config type for this dataset.
    """

    def __init__(self, **kwargs):
        """Create the config.

        Only keyword arguments are accepted; all of them are passed on
        unchanged to the ``datasets.BuilderConfig`` initializer.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


@dataclass
class CoNLLppDocument(TextDocument):
    """Text document carrying one annotation layer of labeled spans."""

    # Labeled spans of this document; the layer is declared with the
    # ``text`` field as its target.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")


class CoNLLpp(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder that converts CoNLLpp examples into documents.

    Each example of the base dataset becomes one ``CoNLLppDocument`` whose
    ``entities`` layer holds the labeled spans derived from the example's
    tokens and NER tags.
    """

    # Document class produced by this builder.
    DOCUMENT_TYPE = CoNLLppDocument

    # Identifier of the underlying dataset this builder wraps
    # (presumably a Hugging Face dataset name; verify against the base class).
    BASE_DATASET_PATH = "conllpp"

    BUILDER_CONFIGS = [
        CoNLLppConfig(
            name="conllpp",
            version=datasets.Version("1.0.0"),
            description="CoNLLpp dataset",
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments for ``_generate_document``.

        Args:
            dataset: base dataset whose ``features["ner_tags"].feature``
                provides an ``int2str`` callable.

        Returns:
            A dict with the single key ``"int_to_str"`` mapped to that
            callable.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Build a ``CoNLLppDocument`` from one example.

        Args:
            example: mapping with the keys ``"id"``, ``"tokens"`` and
                ``"ner_tags"``.
            int_to_str: callable applied to every entry of
                ``example["ner_tags"]`` to obtain the tag passed to the
                span helper.

        Returns:
            The document with its ``entities`` filled in order of span start.
        """
        example_id = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        text, labeled_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=tag_names
        )
        document = CoNLLppDocument(text=text, id=example_id)

        # Spans are added one by one, ordered by their start offset.
        ordered_spans = sorted(labeled_spans, key=lambda labeled_span: labeled_span.start)
        for labeled_span in ordered_spans:
            document.entities.append(labeled_span)

        return document
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from dataclasses import dataclass

import datasets

import pytorch_ie.data.builder
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextDocument
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans

# Version string handed to datasets.Version for every builder config.
_VERSION = "1.0.0"
# Court identifiers; each one is also used as the name of a per-court config.
_COURTS = ["bag", "bfh", "bgh", "bpatg", "bsg", "bverfg", "bverwg"]
# Maps every court identifier to its "<court>.conll" file name.
_COURTS_FILEPATHS = {court: f"{court}.conll" for court in _COURTS}
# Name of the config that covers all courts (also used as the default config).
_ALL = "all"


class GermanLegalEntityRecognitionConfig(datasets.BuilderConfig):
    """Builder configuration that selects a set of courts.

    The config version is always derived from ``_VERSION``; because it is
    passed explicitly to the parent initializer, callers must not supply
    their own ``version`` keyword.
    """

    def __init__(self, *args, courts=None, **kwargs):
        """Create the config.

        Args:
            *args: positional arguments forwarded to super.
            courts: iterable of court identifiers (keys of
                ``_COURTS_FILEPATHS``) covered by this config. Stored as-is.
            **kwargs: keyword arguments forwarded to super.
        """
        fixed_version = datasets.Version(_VERSION, "")
        super().__init__(*args, version=fixed_version, **kwargs)
        self.courts = courts

    @property
    def filepaths(self):
        """List the file names of the selected courts, in ``courts`` order.

        Iterates ``self.courts`` directly, so it fails if ``courts`` was
        left at its ``None`` default.
        """
        selected = []
        for court_name in self.courts:
            selected.append(_COURTS_FILEPATHS[court_name])
        return selected


@dataclass
class GermanLegalEntityRecognitionDocument(TextDocument):
    """Text document carrying one annotation layer of labeled spans."""

    # Labeled spans of this document; the layer is declared with the
    # ``text`` field as its target.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")


class GermanLegalEntityRecognition(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder for the German legal entity recognition data.

    Each example of the base dataset becomes one
    ``GermanLegalEntityRecognitionDocument`` whose ``entities`` layer holds
    the labeled spans derived from the example's tokens and NER tags.
    """

    # Document class produced by this builder.
    DOCUMENT_TYPE = GermanLegalEntityRecognitionDocument

    # Identifier of the underlying dataset this builder wraps
    # (presumably a Hugging Face dataset name; verify against the base class).
    BASE_DATASET_PATH = "german_legal_entity_recognition"

    # One config per single court, followed by one config covering all courts.
    BUILDER_CONFIGS = [
        *[
            GermanLegalEntityRecognitionConfig(
                name=court,
                courts=[court],
                description=f"Court. {court}.",
            )
            for court in _COURTS
        ],
        GermanLegalEntityRecognitionConfig(
            name=_ALL,
            courts=_COURTS,
            description="All courts included.",
        ),
    ]
    BUILDER_CONFIG_CLASS = GermanLegalEntityRecognitionConfig
    DEFAULT_CONFIG_NAME = _ALL  # type: ignore

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments for ``_generate_document``.

        Args:
            dataset: base dataset whose ``features["ner_tags"].feature``
                provides an ``int2str`` callable.

        Returns:
            A dict with the single key ``"int_to_str"`` mapped to that
            callable.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Build a ``GermanLegalEntityRecognitionDocument`` from one example.

        Args:
            example: mapping with the keys ``"id"``, ``"tokens"`` and
                ``"ner_tags"``.
            int_to_str: callable applied to every entry of
                ``example["ner_tags"]`` to obtain the tag passed to the
                span helper.

        Returns:
            The document with its ``entities`` filled in order of span start.
        """
        example_id = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        text, labeled_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=tag_names
        )
        document = GermanLegalEntityRecognitionDocument(text=text, id=example_id)

        # Spans are added one by one, ordered by their start offset.
        ordered_spans = sorted(labeled_spans, key=lambda labeled_span: labeled_span.start)
        for labeled_span in ordered_spans:
            document.entities.append(labeled_span)

        return document
Binary file not shown.
57 changes: 57 additions & 0 deletions dataset_builders/pie/germaner/germaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from dataclasses import dataclass

import datasets

import pytorch_ie.data.builder
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextDocument
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans


class GermaNERConfig(datasets.BuilderConfig):
    """Builder configuration used by the GermaNER dataset builder.

    Adds nothing on top of ``datasets.BuilderConfig``; it only provides a
    dedicated config type for this dataset.
    """

    def __init__(self, **kwargs):
        """Create the config.

        Only keyword arguments are accepted; all of them are passed on
        unchanged to the ``datasets.BuilderConfig`` initializer.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


@dataclass
class GermaNERDocument(TextDocument):
    """Text document carrying one annotation layer of labeled spans."""

    # Labeled spans of this document; the layer is declared with the
    # ``text`` field as its target.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")


class GermaNER(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder that converts GermaNER examples into documents.

    Each example of the base dataset becomes one ``GermaNERDocument`` whose
    ``entities`` layer holds the labeled spans derived from the example's
    tokens and NER tags.
    """

    # Document class produced by this builder.
    DOCUMENT_TYPE = GermaNERDocument

    # Identifier of the underlying dataset this builder wraps
    # (presumably a Hugging Face dataset name; verify against the base class).
    BASE_DATASET_PATH = "germaner"

    BUILDER_CONFIGS = [
        GermaNERConfig(
            name="germaner", version=datasets.Version("0.9.1"), description="GermaNER dataset"
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments for ``_generate_document``.

        Args:
            dataset: base dataset whose ``features["ner_tags"].feature``
                provides an ``int2str`` callable.

        Returns:
            A dict with the single key ``"int_to_str"`` mapped to that
            callable.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Build a ``GermaNERDocument`` from one example.

        Args:
            example: mapping with the keys ``"id"``, ``"tokens"`` and
                ``"ner_tags"``.
            int_to_str: callable applied to every entry of
                ``example["ner_tags"]`` to obtain the tag passed to the
                span helper.

        Returns:
            The document with its ``entities`` filled in order of span start.
        """
        example_id = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        text, labeled_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=tag_names
        )
        document = GermaNERDocument(text=text, id=example_id)

        # Spans are added one by one, ordered by their start offset.
        ordered_spans = sorted(labeled_spans, key=lambda labeled_span: labeled_span.start)
        for labeled_span in ordered_spans:
            document.entities.append(labeled_span)

        return document
Binary file not shown.
60 changes: 60 additions & 0 deletions dataset_builders/pie/germeval_14/germeval_14.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from dataclasses import dataclass

import datasets

import pytorch_ie.data.builder
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextDocument
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans


class GermEval14Config(datasets.BuilderConfig):
    """Builder configuration used by the GermEval 2014 dataset builder.

    Adds nothing on top of ``datasets.BuilderConfig``; it only provides a
    dedicated config type for this dataset.
    """

    def __init__(self, **kwargs):
        """Create the config.

        Only keyword arguments are accepted; all of them are passed on
        unchanged to the ``datasets.BuilderConfig`` initializer.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


@dataclass
class GermEval14Document(TextDocument):
    """Text document carrying one annotation layer of labeled spans."""

    # Labeled spans of this document; the layer is declared with the
    # ``text`` field as its target. The builder stores spans from both the
    # outer and the nested tag sequence in this single layer.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")


class GermEval14(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder that converts GermEval 2014 examples into documents.

    Each example of the base dataset becomes one ``GermEval14Document``.
    Spans derived from ``ner_tags`` and from ``nested_ner_tags`` are merged
    into the single ``entities`` layer.
    """

    # Document class produced by this builder.
    DOCUMENT_TYPE = GermEval14Document

    # Identifier of the underlying dataset this builder wraps
    # (presumably a Hugging Face dataset name; verify against the base class).
    BASE_DATASET_PATH = "germeval_14"

    BUILDER_CONFIGS = [
        GermEval14Config(
            name="germeval_14",
            version=datasets.Version("2.0.0"),
            description="GermEval 2014 NER Shared Task dataset",
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments for ``_generate_document``.

        Args:
            dataset: base dataset whose ``features["ner_tags"].feature``
                provides an ``int2str`` callable.

        Returns:
            A dict with the single key ``"int_to_str"`` mapped to that
            callable.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Build a ``GermEval14Document`` from one example.

        Args:
            example: mapping with the keys ``"id"``, ``"tokens"``,
                ``"ner_tags"`` and ``"nested_ner_tags"``.
            int_to_str: callable applied to every entry of both tag
                sequences; the same converter is used for outer and nested
                tags.

        Returns:
            The document whose ``entities`` contain the outer and the nested
            spans, ordered by span start.
        """
        example_id = example["id"]
        words = example["tokens"]
        outer_tag_names = list(map(int_to_str, example["ner_tags"]))
        nested_tag_names = list(map(int_to_str, example["nested_ner_tags"]))

        text, outer_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=outer_tag_names
        )
        # The text of the second call is discarded: it is built from the same
        # tokens as the first one.
        _, nested_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=nested_tag_names
        )

        document = GermEval14Document(text=text, id=example_id)

        # Outer spans are listed first, so the stable sort keeps them ahead of
        # nested spans that share the same start.
        combined_spans = outer_spans + nested_spans
        ordered_spans = sorted(combined_spans, key=lambda labeled_span: labeled_span.start)
        for labeled_span in ordered_spans:
            document.entities.append(labeled_span)

        return document
Binary file not shown.
56 changes: 56 additions & 0 deletions dataset_builders/pie/ncbi_disease/ncbi_disease.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from dataclasses import dataclass

import datasets

import pytorch_ie.data.builder
from pytorch_ie.annotations import LabeledSpan
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextDocument
from pytorch_ie.utils.span import tokens_and_tags_to_text_and_labeled_spans


class NCBIDiseaseConfig(datasets.BuilderConfig):
    """Builder configuration used by the NCBIDisease dataset builder.

    Adds nothing on top of ``datasets.BuilderConfig``; it only provides a
    dedicated config type for this dataset.
    """

    def __init__(self, **kwargs):
        """Create the config.

        Only keyword arguments are accepted; all of them are passed on
        unchanged to the ``datasets.BuilderConfig`` initializer.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


@dataclass
class NCBIDiseaseDocument(TextDocument):
    """Text document carrying one annotation layer of labeled spans."""

    # Labeled spans of this document; the layer is declared with the
    # ``text`` field as its target.
    entities: AnnotationList[LabeledSpan] = annotation_field(target="text")


class NCBIDisease(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """PIE dataset builder that converts NCBIDisease examples into documents.

    Each example of the base dataset becomes one ``NCBIDiseaseDocument``
    whose ``entities`` layer holds the labeled spans derived from the
    example's tokens and NER tags.
    """

    # Document class produced by this builder.
    DOCUMENT_TYPE = NCBIDiseaseDocument

    # Identifier of the underlying dataset this builder wraps
    # (presumably a Hugging Face dataset name; verify against the base class).
    BASE_DATASET_PATH = "ncbi_disease"

    BUILDER_CONFIGS = [
        NCBIDiseaseConfig(
            name="ncbi_disease",
            version=datasets.Version("1.0.0"),
            description="NCBIDisease dataset",
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        """Return the extra keyword arguments for ``_generate_document``.

        Args:
            dataset: base dataset whose ``features["ner_tags"].feature``
                provides an ``int2str`` callable.

        Returns:
            A dict with the single key ``"int_to_str"`` mapped to that
            callable.
        """
        tag_feature = dataset.features["ner_tags"].feature
        return {"int_to_str": tag_feature.int2str}

    def _generate_document(self, example, int_to_str):
        """Build an ``NCBIDiseaseDocument`` from one example.

        Args:
            example: mapping with the keys ``"id"``, ``"tokens"`` and
                ``"ner_tags"``.
            int_to_str: callable applied to every entry of
                ``example["ner_tags"]`` to obtain the tag passed to the
                span helper.

        Returns:
            The document with its ``entities`` filled in order of span start.
        """
        example_id = example["id"]
        words = example["tokens"]
        tag_names = list(map(int_to_str, example["ner_tags"]))

        text, labeled_spans = tokens_and_tags_to_text_and_labeled_spans(
            tokens=words, tags=tag_names
        )
        document = NCBIDiseaseDocument(text=text, id=example_id)

        # Spans are added one by one, ordered by their start offset.
        ordered_spans = sorted(labeled_spans, key=lambda labeled_span: labeled_span.start)
        for labeled_span in ordered_spans:
            document.entities.append(labeled_span)

        return document
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 845a5fa

Please sign in to comment.