fix from comments

ArneBinder · Nov 7, 2023 · ceef7cd
1 parent 481f332
commit ceef7cd
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 55 deletions.
diff --git a/dataset_builders/pie/cdcp/README.md b/dataset_builders/pie/cdcp/README.md
@@ -0,0 +1,29 @@
+# PIE Dataset Card for "CDCP"
+
+This is a [PyTorch-IE](https://github.com/ChristophAlt/pytorch-ie) wrapper for the
+[CDCP Huggingface dataset loading script](https://huggingface.co/datasets/DFKI-SLT/cdcp).
+
+## Data Schema
+
+The document type for this dataset is `CDCPDocument` which defines the following data fields:
+
+- `text` (str)
+- `id` (str, optional)
+- `metadata` (dictionary, dataclasses)
+
+and the following annotation layers:
+
+- `propositions` (annotation type: `LabeledSpan`, target: `text`)
+- `relations` (annotation type: `BinaryRelation`, target: `propositions`)
+- `urls` (annotation type: `Attribute`, target: `propositions`)
+
+See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/annotations.py) for the annotation type definitions.
+
+## Document Converters
+
+The dataset provides document converters for the following target document types:
+
+- `pytorch_ie.documents.TextDocumentWithLabeledSpansAndBinaryRelations`
+
+See [here](https://github.com/ChristophAlt/pytorch-ie/blob/main/src/pytorch_ie/documents.py) for the document type
+definitions.
diff --git a/dataset_builders/pie/cdcp/cdcp.py b/dataset_builders/pie/cdcp/cdcp.py
@@ -5,8 +5,11 @@
 import datasets
 import pytorch_ie.data.builder
 from pytorch_ie.annotations import BinaryRelation, LabeledSpan
-from pytorch_ie.core import Annotation, AnnotationList, Document, annotation_field
-from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations
+from pytorch_ie.core import Annotation, AnnotationList, annotation_field
+from pytorch_ie.documents import (
+    TextBasedDocument,
+    TextDocumentWithLabeledSpansAndBinaryRelations,
+)
 
 from pie_datasets.document.processing.text_span_trimmer import trim_text_spans
 
@@ -18,12 +21,7 @@ def dl2ld(dict_of_lists):
 
 
 def ld2dl(list_of_dicts, keys: Optional[List[str]] = None, as_list: bool = False):
-    if keys is None:
-        keys = list_of_dicts[0].keys()
-    if as_list:
-        return [[d[k] for d in list_of_dicts] for k in keys]
-    else:
-        return {k: [d[k] for d in list_of_dicts] for k in keys}
+    return {k: [d[k] for d in list_of_dicts] for k in keys}
 
 
 @dataclasses.dataclass(frozen=True)
@@ -33,10 +31,7 @@ class Attribute(Annotation):
 
 
 @dataclasses.dataclass
-class CDCPDocument(Document):
-    text: str
-    id: Optional[str] = None
-    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
+class CDCPDocument(TextBasedDocument):
     propositions: AnnotationList[LabeledSpan] = annotation_field(target="text")
     relations: AnnotationList[BinaryRelation] = annotation_field(target="propositions")
     urls: AnnotationList[Attribute] = annotation_field(target="propositions")
@@ -122,18 +117,6 @@ def convert_to_text_document_with_labeled_spans_and_binary_relations(
     return result
 
 
-class CDCPConfig(datasets.BuilderConfig):
-    """BuilderConfig for CDCP."""
-
-    def __init__(self, **kwargs):
-        """BuilderConfig for CDCP.
-
-        Args:
-          **kwargs: keyword arguments forwarded to super.
-        """
-        super().__init__(**kwargs)
-
-
 class CDCP(pytorch_ie.data.builder.GeneratorBasedBuilder):
     DOCUMENT_TYPE = CDCPDocument
 

diff --git a/src/pie_datasets/document/types.py b/src/pie_datasets/document/types.py
@@ -40,34 +40,3 @@ class TokenDocumentWithLabeledSpans(TokenBasedDocument):
 @dataclasses.dataclass
 class TokenDocumentWithLabeledSpansAndBinaryRelations(TokenDocumentWithLabeledSpans):
     binary_relations: AnnotationList[BinaryRelation] = annotation_field(target="labeled_spans")
-
-
-@dataclasses.dataclass
-class TokenDocumentWithLabeledSpansBinaryRelationsAndLabeledPartitions(
-    TokenDocumentWithLabeledSpansAndBinaryRelations
-):
-    labeled_partitions: AnnotationList[LabeledSpan] = annotation_field(target="tokens")
-
-
-@dataclasses.dataclass
-class BratDocument(Document):
-    # copied from https://huggingface.co/datasets/pie/brat/blob/main/brat.py
-    text: str
-    id: Optional[str] = None
-    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
-    spans: AnnotationList[LabeledMultiSpan] = annotation_field(target="text")
-    relations: AnnotationList[BinaryRelation] = annotation_field(target="spans")
-    span_attributions: AnnotationList[Attribute] = annotation_field(target="spans")
-    relation_attributions: AnnotationList[Attribute] = annotation_field(target="relations")
-
-
-@dataclasses.dataclass
-class BratDocumentWithMergedSpans(Document):
-    # copied from https://huggingface.co/datasets/pie/brat/blob/main/brat.py
-    text: str
-    id: Optional[str] = None
-    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
-    spans: AnnotationList[LabeledSpan] = annotation_field(target="text")
-    relations: AnnotationList[BinaryRelation] = annotation_field(target="spans")
-    span_attributions: AnnotationList[Attribute] = annotation_field(target="spans")
-    relation_attributions: AnnotationList[Attribute] = annotation_field(target="relations")