Skip to content

Commit

Permalink
fix from comments
Browse files Browse the repository at this point in the history
  • Loading branch information
idalr committed Nov 7, 2023
1 parent 481f332 commit ceef7cd
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 55 deletions.
29 changes: 29 additions & 0 deletions dataset_builders/pie/cdcp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# PIE Dataset Card for "CDCP"

This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
[CDCP Hugging Face dataset loading script](https://huggingface.co/datasets/DFKI-SLT/cdcp).

## Data Schema

The document type for this dataset is `CDCPDocument` which defines the following data fields:

- `text` (str)
- `id` (str, optional)
- `metadata` (dictionary, optional; defaults to an empty dictionary)

and the following annotation layers:

- `propositions` (annotation type: `LabeledSpan`, target: `text`)
- `relations` (annotation type: `BinaryRelation`, target: `propositions`)
- `urls` (annotation type: `Attribute`, target: `propositions`)

See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the `LabeledSpan` and `BinaryRelation` type definitions; `Attribute` is defined in this dataset's builder script (`cdcp.py`).

## Document Converters

The dataset provides document converters for the following target document types:

- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations`

See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type
definitions.
31 changes: 7 additions & 24 deletions dataset_builders/pie/cdcp/cdcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
import datasets
import pytorch_ie.data.builder
from pytorch_ie.annotations import BinaryRelation, LabeledSpan
from pytorch_ie.core import Annotation, AnnotationList, Document, annotation_field
from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations
from pytorch_ie.core import Annotation, AnnotationList, annotation_field
from pytorch_ie.documents import (
TextBasedDocument,
TextDocumentWithLabeledSpansAndBinaryRelations,
)

from pie_datasets.document.processing.text_span_trimmer import trim_text_spans

Expand All @@ -18,12 +21,7 @@ def dl2ld(dict_of_lists):


def ld2dl(list_of_dicts, keys: Optional[List[str]] = None, as_list: bool = False):
    """Convert a list of dicts into a dict of lists (or a list of lists).

    Args:
        list_of_dicts: Sequence of mappings that all contain the requested keys.
        keys: Keys to collect. When omitted, the keys of the first entry are
            used, in that entry's iteration order.
        as_list: If True, return one list per key (ordered like ``keys``)
            instead of a dict keyed by ``keys``.

    Returns:
        ``{key: [d[key] for d in list_of_dicts]}`` by default, or the
        corresponding list of per-key lists when ``as_list`` is True. An empty
        input with no explicit ``keys`` yields an empty dict / list.

    Raises:
        KeyError: If an entry lacks one of the requested keys.
    """
    if keys is None:
        # An empty input has no first entry to derive the keys from.
        keys = list(list_of_dicts[0]) if list_of_dicts else []
    columns = [[d[k] for d in list_of_dicts] for k in keys]
    if as_list:
        return columns
    return dict(zip(keys, columns))


@dataclasses.dataclass(frozen=True)
Expand All @@ -33,10 +31,7 @@ class Attribute(Annotation):


@dataclasses.dataclass
class CDCPDocument(Document):
text: str
id: Optional[str] = None
metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
class CDCPDocument(TextBasedDocument):
propositions: AnnotationList[LabeledSpan] = annotation_field(target="text")
relations: AnnotationList[BinaryRelation] = annotation_field(target="propositions")
urls: AnnotationList[Attribute] = annotation_field(target="propositions")
Expand Down Expand Up @@ -122,18 +117,6 @@ def convert_to_text_document_with_labeled_spans_and_binary_relations(
return result


class CDCPConfig(datasets.BuilderConfig):
    """BuilderConfig for CDCP."""

    def __init__(self, **kwargs):
        """BuilderConfig for CDCP.

        Adds no options of its own; every keyword argument is passed
        unchanged to the parent initializer.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


class CDCP(pytorch_ie.data.builder.GeneratorBasedBuilder):
DOCUMENT_TYPE = CDCPDocument

Expand Down
31 changes: 0 additions & 31 deletions src/pie_datasets/document/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,34 +40,3 @@ class TokenDocumentWithLabeledSpans(TokenBasedDocument):
@dataclasses.dataclass
class TokenDocumentWithLabeledSpansAndBinaryRelations(TokenDocumentWithLabeledSpans):
    """Token document that adds a binary-relation layer to its parent class."""

    # BinaryRelation annotations targeting the `labeled_spans` layer
    # (presumably inherited from TokenDocumentWithLabeledSpans — confirm there).
    binary_relations: AnnotationList[BinaryRelation] = annotation_field(target="labeled_spans")


@dataclasses.dataclass
class TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions(
    TokenDocumentWithLabeledSpansAndBinaryRelations
):
    """Token document that adds a labeled-partition layer to its parent class."""

    # LabeledSpan annotations targeting `tokens`.
    labeled_partitions: AnnotationList[LabeledSpan] = annotation_field(target="tokens")


@dataclasses.dataclass
class BratDocument(Document):
    """Text document with span, relation and attribute annotation layers.

    Spans in this variant are `LabeledMultiSpan` annotations.
    """

    # copied from https://huggingface.co/datasets/pie/brat/blob/main/brat.py
    text: str
    id: Optional[str] = None
    # Defaults to a fresh empty dict per instance.
    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
    # Span layer targets `text`; relations and span attributes target `spans`;
    # relation attributes target `relations`.
    spans: AnnotationList[LabeledMultiSpan] = annotation_field(target="text")
    relations: AnnotationList[BinaryRelation] = annotation_field(target="spans")
    span_attributions: AnnotationList[Attribute] = annotation_field(target="spans")
    relation_attributions: AnnotationList[Attribute] = annotation_field(target="relations")


@dataclasses.dataclass
class BratDocumentWithMergedSpans(Document):
    """Text document with span, relation and attribute annotation layers.

    Spans in this variant are `LabeledSpan` annotations.
    """

    # copied from https://huggingface.co/datasets/pie/brat/blob/main/brat.py
    text: str
    id: Optional[str] = None
    # Defaults to a fresh empty dict per instance.
    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
    # Span layer targets `text`; relations and span attributes target `spans`;
    # relation attributes target `relations`.
    spans: AnnotationList[LabeledSpan] = annotation_field(target="text")
    relations: AnnotationList[BinaryRelation] = annotation_field(target="spans")
    span_attributions: AnnotationList[Attribute] = annotation_field(target="spans")
    relation_attributions: AnnotationList[Attribute] = annotation_field(target="relations")

0 comments on commit ceef7cd

Please sign in to comment.