Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AM dataset - CDCP #15

Merged
merged 14 commits into from
Nov 9, 2023
5 changes: 2 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,8 @@ repos:
- id: codespell
args:
- --skip=logs/**,data/**,tests/fixtures/**
# hist: required for plotext.hist()
# ba: denotes beginning of an encoding with label as 'a'. More details at src/pie_utils/sequence_tagging/ill_formed.py
- --ignore-words-list=hist,ba
# arbitral: this is a legal term and used in example data (cdcp dataset)
- --ignore-words-list=arbitral

# python static type checking
- repo: https://github.com/pre-commit/mirrors-mypy
Expand Down
29 changes: 29 additions & 0 deletions dataset_builders/pie/cdcp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# PIE Dataset Card for "CDCP"

This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
[CDCP Huggingface dataset loading script](https://huggingface.co/datasets/DFKI-SLT/cdcp).

## Data Schema

The document type for this dataset is `CDCPDocument` which defines the following data fields:

- `text` (str)
- `id` (str, optional)
- `metadata` (dictionary, optional)

and the following annotation layers:

- `propositions` (annotation type: `LabeledSpan`, target: `text`)
- `relations` (annotation type: `BinaryRelation`, target: `propositions`)
- `urls` (annotation type: `Attribute`, target: `propositions`)

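See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the definitions of the `LabeledSpan` and `BinaryRelation` annotation types. The `Attribute` annotation type (fields: `value`, `annotation`) is defined in the dataset builder script `cdcp.py` itself.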

## Document Converters

The dataset provides document converters for the following target document types:

- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations`

See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type
definitions.
142 changes: 142 additions & 0 deletions dataset_builders/pie/cdcp/cdcp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import dataclasses
import logging
from typing import Any, Callable, Dict, List, Optional

import datasets
from pytorch_ie.annotations import BinaryRelation, LabeledSpan
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
from pytorch_ie.documents import (
TextBasedDocument,
TextDocumentWithLabeledSpansAndBinaryRelations,
)

from pie_datasets import GeneratorBasedBuilder
from pie_datasets.document.processing.text_span_trimmer import trim_text_spans

log = logging.getLogger(__name__)


def dl2ld(dict_of_lists):
    """Turn a dict of parallel lists into a list of dicts ("dict-of-lists to list-of-dicts").

    The i-th result dict maps every key of ``dict_of_lists`` to the i-th entry
    of that key's list. Because the rows are produced with ``zip``, the result
    has as many entries as the shortest list; an empty dict yields ``[]``.
    """
    column_names = list(dict_of_lists)
    rows = zip(*dict_of_lists.values())
    return [dict(zip(column_names, row)) for row in rows]


def ld2dl(list_of_dicts, keys: Optional[List[str]] = None):
    """Turn a list of dicts into a dict of parallel lists ("list-of-dicts to dict-of-lists").

    Args:
        list_of_dicts: any iterable of mappings (a list, a dict view, a
            generator, ...). It is consumed exactly once.
        keys: the keys to extract. If ``None``, the keys of the first entry
            are used; with no entries and no keys the result is ``{}``.

    Returns:
        A dict mapping each key to the list of values found under that key,
        in input order. With explicit ``keys`` and no entries, every key maps
        to an empty list.

    Raises:
        KeyError: if an entry lacks one of the requested keys.
    """
    # Materialise once: the input is traversed once per key, which would
    # silently yield empty columns for a one-shot iterator.
    rows = list(list_of_dicts)
    if keys is None:
        # The signature advertises ``keys`` as optional, so fall back to the
        # keys of the first row instead of failing on ``for k in None``.
        keys = list(rows[0]) if rows else []
    return {k: [d[k] for d in rows] for k in keys}


@dataclasses.dataclass(eq=True, frozen=True)
class Attribute(Annotation):
    """Immutable annotation that attaches a string value to another annotation."""

    # The attribute's payload (this file stores a proposition's url here).
    value: str
    # The annotation this attribute belongs to.
    annotation: Annotation


@dataclasses.dataclass
class CDCPDocument(TextBasedDocument):
    """Text document carrying the three CDCP annotation layers."""

    # Labeled spans over the document text.
    propositions: AnnotationList[LabeledSpan] = annotation_field(target="text")
    # Labeled binary relations whose head and tail are propositions.
    relations: AnnotationList[BinaryRelation] = annotation_field(target="propositions")
    # Url attributes attached to individual propositions.
    urls: AnnotationList[Attribute] = annotation_field(target="propositions")


def example_to_document(
    example: Dict[str, Any],
    relation_label: datasets.ClassLabel,
    proposition_label: datasets.ClassLabel,
):
    """Build a ``CDCPDocument`` from one example of the underlying dataset.

    Args:
        example: a mapping with the entries ``"id"``, ``"text"``,
            ``"propositions"`` (dict of parallel lists with ``"start"``,
            ``"end"``, ``"label"`` and optionally ``"url"``) and
            ``"relations"`` (dict of parallel lists with ``"head"``,
            ``"tail"`` and ``"label"``).
        relation_label: class label used to turn relation label ids into strings.
        proposition_label: class label used to turn proposition label ids into strings.

    Returns:
        The document with its ``propositions``, ``urls`` and ``relations``
        layers filled.
    """
    doc = CDCPDocument(id=example["id"], text=example["text"])

    for prop in dl2ld(example["propositions"]):
        span = LabeledSpan(
            start=prop["start"],
            end=prop["end"],
            label=proposition_label.int2str(prop["label"]),
        )
        doc.propositions.append(span)

        # A missing or empty url means that no url is attached to this proposition.
        url_value = prop.get("url", "")
        if url_value != "":
            doc.urls.append(Attribute(annotation=span, value=url_value))

    for rel in dl2ld(example["relations"]):
        # "head" / "tail" are positions in the propositions layer built above.
        head_span = doc.propositions[rel["head"]]
        tail_span = doc.propositions[rel["tail"]]
        rel_label = relation_label.int2str(rel["label"])
        doc.relations.append(BinaryRelation(head=head_span, tail=tail_span, label=rel_label))

    return doc


def document_to_example(
    document: CDCPDocument,
    relation_label: datasets.ClassLabel,
    proposition_label: datasets.ClassLabel,
) -> Dict[str, Any]:
    """Convert a ``CDCPDocument`` back into the dict layout of the underlying dataset.

    Args:
        document: the document to serialise.
        relation_label: class label used to turn relation label strings into ids.
        proposition_label: class label used to turn proposition label strings into ids.

    Returns:
        A dict with ``"id"``, ``"text"``, ``"propositions"`` (parallel lists
        ``start`` / ``end`` / ``label`` / ``url``) and ``"relations"``
        (parallel lists ``head`` / ``tail`` / ``label``). A proposition
        without a url attribute gets the empty string as url.

    Raises:
        KeyError: if a url or relation refers to an annotation that is not in
            ``document.propositions``.
    """
    result: Dict[str, Any] = {"id": document.id, "text": document.text}

    # One entry per proposition *position*. Keying these dicts by the
    # annotation itself would merge propositions that compare equal, while the
    # indices below still count every position, so relation indices could then
    # point past the end of the emitted proposition lists.
    proposition_dicts = []
    proposition2idx = {}
    for idx, proposition in enumerate(document.propositions):
        proposition_dicts.append(
            {
                "start": proposition.start,
                "end": proposition.end,
                "label": proposition_label.str2int(proposition.label),
                "url": "",
            }
        )
        proposition2idx[proposition] = idx

    for url in document.urls:
        proposition_dicts[proposition2idx[url.annotation]]["url"] = url.value

    result["propositions"] = ld2dl(proposition_dicts, keys=["start", "end", "label", "url"])

    relations = [
        {
            "head": proposition2idx[relation.head],
            "tail": proposition2idx[relation.tail],
            "label": relation_label.str2int(relation.label),
        }
        for relation in document.relations
    ]
    result["relations"] = ld2dl(relations, keys=["head", "tail", "label"])

    return result


def convert_to_text_document_with_labeled_spans_and_binary_relations(
    document: CDCPDocument,
    verbose: bool = True,
) -> TextDocumentWithLabeledSpansAndBinaryRelations:
    """Convert a ``CDCPDocument`` to ``TextDocumentWithLabeledSpansAndBinaryRelations``.

    ``propositions`` is mapped to ``labeled_spans`` and ``relations`` to
    ``binary_relations``; the ``urls`` layer is not part of the mapping. The
    converted document is then passed through ``trim_text_spans`` for the
    ``labeled_spans`` layer.

    Args:
        document: the document to convert.
        verbose: forwarded unchanged to ``trim_text_spans``.
    """
    layer_renames = {"propositions": "labeled_spans", "relations": "binary_relations"}
    converted = document.as_type(
        TextDocumentWithLabeledSpansAndBinaryRelations,
        field_mapping=layer_renames,
    )
    return trim_text_spans(converted, layer="labeled_spans", verbose=verbose)


class CDCP(GeneratorBasedBuilder):
    """PIE dataset builder on top of the ``DFKI-SLT/cdcp`` Huggingface dataset."""

    DOCUMENT_TYPE = CDCPDocument

    DOCUMENT_CONVERTERS = {
        TextDocumentWithLabeledSpansAndBinaryRelations: convert_to_text_document_with_labeled_spans_and_binary_relations
    }

    BASE_DATASET_PATH = "DFKI-SLT/cdcp"

    BUILDER_CONFIGS = [datasets.BuilderConfig(name="default")]

    DEFAULT_CONFIG_NAME = "default"  # type: ignore

    def _generate_document_kwargs(self, dataset):
        """Return the label features that ``_generate_document`` takes as keyword arguments."""
        features = dataset.features
        relation_label = features["relations"].feature["label"]
        proposition_label = features["propositions"].feature["label"]
        return {
            "relation_label": relation_label,
            "proposition_label": proposition_label,
        }

    def _generate_document(self, example, relation_label, proposition_label):
        """Convert a single dataset example into a ``CDCPDocument``."""
        label_kwargs = {
            "relation_label": relation_label,
            "proposition_label": proposition_label,
        }
        return example_to_document(example, **label_kwargs)
1 change: 1 addition & 0 deletions dataset_builders/pie/cdcp/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pie-datasets>=0.3.0
12 changes: 11 additions & 1 deletion src/pie_datasets/document/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from pytorch_ie.annotations import BinaryRelation, LabeledMultiSpan, LabeledSpan
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
from pytorch_ie.documents import TextBasedDocument
from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument


@dataclasses.dataclass(eq=True, frozen=True)
Expand All @@ -28,3 +28,13 @@ class BratDocumentWithMergedSpans(TextBasedDocument):
relations: AnnotationList[BinaryRelation] = annotation_field(target="spans")
span_attributes: AnnotationList[Attribute] = annotation_field(target="spans")
relation_attributes: AnnotationList[Attribute] = annotation_field(target="relations")


@dataclasses.dataclass
class TokenDocumentWithLabeledSpans(TokenBasedDocument):
    """Token-based document with a single layer of labeled spans."""

    # Labeled spans defined over the document's tokens.
    labeled_spans: AnnotationList[LabeledSpan] = annotation_field(target="tokens")


@dataclasses.dataclass
class TokenDocumentWithLabeledSpansAndBinaryRelations(TokenDocumentWithLabeledSpans):
    """Token-based document with labeled spans plus binary relations between them."""

    # Binary relations whose arguments come from the ``labeled_spans`` layer.
    binary_relations: AnnotationList[BinaryRelation] = annotation_field(target="labeled_spans")
Empty file.
Loading
Loading