From 87295838064b601ac8042419fb5f22bfcdea54e2 Mon Sep 17 00:00:00 2001
From: Ruangrin L <88072261+idalr@users.noreply.github.com>
Date: Thu, 9 Nov 2023 16:03:11 +0100
Subject: [PATCH] edit types.py, cdcp.py, test_cdcp.py, add requirements.txt
---
dataset_builders/hf/cdcp/README.md | 135 ---------------------
dataset_builders/pie/cdcp/cdcp.py | 8 +-
dataset_builders/pie/cdcp/requirements.txt | 1 +
src/pie_datasets/document/types.py | 22 +---
tests/dataset_builders/pie/test_cdcp.py | 4 +-
5 files changed, 8 insertions(+), 162 deletions(-)
delete mode 100644 dataset_builders/hf/cdcp/README.md
create mode 100644 dataset_builders/pie/cdcp/requirements.txt
diff --git a/dataset_builders/hf/cdcp/README.md b/dataset_builders/hf/cdcp/README.md
deleted file mode 100644
index 2d48f1d9..00000000
--- a/dataset_builders/hf/cdcp/README.md
+++ /dev/null
@@ -1,135 +0,0 @@
-# Dataset Card for "CDCP"
-
-### Dataset Summary
-
-CDCP (a.k.a. *Cornell eRulemaking Corpus*; [Park and Cardie, 2018](https://aclanthology.org/L18-1257.pdf)) consists of 731 user comments from an eRulemaking platform, written in English. Five types of components (`Fact`, `Testimony`, `Reference`, `Value`, and `Policy`) and two types of supporting relations (`Reason` and `Evidence`) are annotated, following the study by Park et al. (2015). The resulting dataset contains 4931 elementary unit and 1221 support relation annotations. (pp. 1623-1624)
-
-### Supported Tasks and Leaderboards
-
-- **Tasks:** Argument Mining, Link Prediction, Component Classification, Relation Classification
-- **Leaderboards:** https://paperswithcode.com/dataset/cdcp
-
-### Languages
-
-The language in the dataset is English (AmE).
-
-## Dataset Structure
-
-### Data Instances
-
-- **Size of downloaded dataset files:** 5.37 MB
-
-```
-{
- 'id': "00195",
- 'text': "State and local court rules sometimes make default judgments much more likely. For example, when a person who allegedly owes a debt is told to come to court on a work day, they may be forced to choose between a default judgment and their job. I urge the CFPB to find practices that involve scheduling hearings at inconvenient times unfair, deceptive, and abusive, or inconsistent with 1692i",
- 'propositions': {
- "start": [0, 78, 242],
- "end": [78, 242, 391],
- "label": [4, 4, 1],
- "url": ["", "", ""],
- },
- 'relations': {"head": [0, 2], "tail": [1, 0], "label": [1, 1]},
-}
-```
-
-### Data Fields
-
-- `id`: the instance id of the text, a `string` feature
-- `text`: the text (with URLs marked as `__URL__`), a `string` feature
-- `propositions`: the list of annotated spans with their labels and URLs (if applicable), a `dictionary` feature
- - `start`: the indices indicating the inclusive start of the spans, a `list` of `int` feature
- - `end`: the indices indicating the exclusive end of the spans, a `list` of `int` feature
- - `label`: the indices indicating the span type, a `list` of `int` feature (see [label list](https://huggingface.co/datasets/DFKI-SLT/cdcp/blob/main/cdcp.py#L40))
-  - `url`: the URL associated with each proposition (an empty string if there is none), a `list` of `string` feature
-- `relations`: the relations between labeled spans, with their relation labels, a `dictionary` feature
- - `head`: the indices indicating the first element in a relation, a `list` of `int` feature
- - `tail`: the indices indicating the second element in a relation, a `list` of `int` feature
- - `label`: the indices indicating the relation type in a relation, a `list` of `int` feature (see [label list](https://huggingface.co/datasets/DFKI-SLT/cdcp/blob/main/cdcp.py#L41))
-
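-Below is a minimal usage sketch (untested) for loading the raw Hugging Face dataset and iterating over the annotated spans; it assumes the dataset is published as `DFKI-SLT/cdcp` and that the fields are named as described above:
-
-```python
-from datasets import load_dataset
-
-ds = load_dataset("DFKI-SLT/cdcp", split="train")
-example = ds[0]
-
-# spans are stored as parallel lists of character offsets and integer label ids
-props = example["propositions"]
-for start, end, label in zip(props["start"], props["end"], props["label"]):
-    print(label, example["text"][start:end])
-```
-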
-### Data Splits
-
-| | train | test |
-| ------------------------------------------------------------------------------------------------ | ---------------------------------------: | -------------------------------------: |
-| No. of instances | 581 | 150 |
-| No. of span labels <br/> `Fact` <br/> `Testimony` <br/> `Reference` <br/> `Value` <br/> `Policy` | <br/> 654 <br/> 873 <br/> 31 <br/> 1686 <br/> 662 | <br/> 132 <br/> 244 <br/> 1 <br/> 496 <br/> 153 |
-| No. of relation labels <br/> `reason` <br/> `evidence` | <br/> 1055 <br/> 47 | <br/> 298 <br/> 26 |
-
-## Dataset Creation
-
-### Curation Rationale
-
-"eRulemaking is a means for government agencies to directly reach citizens to solicit their opinions and experiences regarding newly proposed rules. The effort, however, is partly hampered by citizens’ comments that lack reasoning and evidence, which are largely ignored since government agencies are unable to evaluate the validity and strength." (p. 1623)
-
-"It will be a valuable resource for building argument mining systems that can not only extract arguments from unstructured text, but also identify ways in which a given argument can be improved with respect to its evaluability." (p. 1624)
-
-### Source Data
-
-eRulemaking comments (see [eRulemaking](https://www.gsa.gov/about-us/organization/federal-acquisition-service/technology-transformation-services/erulemaking))
-
-#### Initial Data Collection and Normalization
-
-"Annotated 731 user comments on Consumer Debt Collection Practices (CDCP) rule by the Consumer Financial Protection Bureau (CFPB) posted on www.regulationroom.org." (p. 1624)
-
-#### Who are the source language producers?
-
-Members of the general public, presumably American citizens.
-
-"According to a voluntary user survey that asked the commenters to self-identify themselves, about 64% of the comments came from consumers, 22% from debt collectors, and the remainder from others, such as consumer advocates and counsellor organizations." (p. 1624)
-
-### Annotations
-
-#### Annotation process
-
-"The annotators annotated the elementary units and support relations defined in the argumentation model proposed by [Park et al. (2015)](https://dl.acm.org/doi/10.1145/2746090.2746118)."
-
-"Each user comment was annotated by two annotators, who independently determined the types of elementary units and support relations among them using the GATE annotation tool (Cunningham et al., 2011). A third annotator manually resolved the conflicts to produce the final dataset."
-
-"Inter-annotator agreement between 2 annotators is measured with Krippendorf’s α with respect to elementary unit type (α=64.8%) and support relations (α=44.1%); IDs of supported elementary units are treated as labels for the supporting elementary units." (p. 1626)
-
-#### Who are the annotators?
-
-\[More Information Needed\]
-
-### Personal and Sensitive Information
-
-\[More Information Needed\]
-
-## Considerations for Using the Data
-
-### Social Impact of Dataset
-
-"Immediate applications include automatically ranking arguments based on their evaluability for a (crude) identification of read-worthy comments and providing real-time feedback to writers, specifying which types of support for which propositions can be added to construct better-formed arguments." (p. 1624)
-
-### Discussion of Biases
-
-About 45% of the elementary units are of the `VALUE` type. A significant portion, roughly 75%, of support relation annotations are between adjacent elementary units. While commenters certainly tend to provide reasons immediately after the proposition to be supported, it is also easier for annotators to identify support relations in close proximity. Thus, support relations in the wild may not be as skewed toward those between adjacent elementary units. (pp. 1626-1627)
-
-### Other Known Limitations
-
-\[More Information Needed\]
-
-## Additional Information
-
-### Dataset Curators
-
-\[More Information Needed\]
-
-### Licensing Information
-
-\[More Information Needed\]
-
-### Citation Information
-
-```
-@inproceedings{park2018corpus,
-  title={A Corpus of eRulemaking User Comments for Measuring Evaluability of Arguments},
- author={Park, Joonsuk and Cardie, Claire},
- booktitle={Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
- year={2018}
-}
-```
-
-### Contributions
-
-Thanks to [@idalr](https://github.com/idalr) for adding this dataset.
diff --git a/dataset_builders/pie/cdcp/cdcp.py b/dataset_builders/pie/cdcp/cdcp.py
index df2c4b2e..73e9c8bd 100644
--- a/dataset_builders/pie/cdcp/cdcp.py
+++ b/dataset_builders/pie/cdcp/cdcp.py
@@ -39,8 +39,8 @@ class CDCPDocument(TextBasedDocument):
def example_to_document(
example: Dict[str, Any],
- relation_label: Callable[[int], str],
- proposition_label: Callable[[int], str],
+ relation_label: datasets.ClassLabel,
+ proposition_label: datasets.ClassLabel,
):
document = CDCPDocument(id=example["id"], text=example["text"])
for proposition_dict in dl2ld(example["propositions"]):
@@ -67,8 +67,8 @@ def example_to_document(
def document_to_example(
document: CDCPDocument,
- relation_label: Callable[[int], str],
- proposition_label: Callable[[int], str],
+ relation_label: datasets.ClassLabel,
+ proposition_label: datasets.ClassLabel,
) -> Dict[str, Any]:
result = {"id": document.id, "text": document.text}
proposition2dict = {}
diff --git a/dataset_builders/pie/cdcp/requirements.txt b/dataset_builders/pie/cdcp/requirements.txt
new file mode 100644
index 00000000..96711063
--- /dev/null
+++ b/dataset_builders/pie/cdcp/requirements.txt
@@ -0,0 +1 @@
+pie-datasets>=0.3.0
diff --git a/src/pie_datasets/document/types.py b/src/pie_datasets/document/types.py
index 393a01b9..f8a2a353 100644
--- a/src/pie_datasets/document/types.py
+++ b/src/pie_datasets/document/types.py
@@ -2,33 +2,13 @@
import logging
from typing import Any, Dict, Optional
-from pytorch_ie.annotations import (
- BinaryRelation,
- LabeledMultiSpan,
- LabeledSpan,
- Span,
- _post_init_single_label,
-)
+from pytorch_ie.annotations import BinaryRelation, LabeledSpan
from pytorch_ie.core import Annotation, AnnotationList, Document, annotation_field
from pytorch_ie.documents import TextBasedDocument, TokenBasedDocument
logger = logging.getLogger(__name__)
-# ========================= Annotation Types ========================= #
-
-
-@dataclasses.dataclass(eq=True, frozen=True)
-class Attribute(Annotation):
- target_annotation: Annotation
- label: str
- value: Optional[str] = None
- score: float = 1.0
-
-
-# ========================= Document Types ========================= #
-
-
@dataclasses.dataclass
class TokenDocumentWithLabeledSpans(TokenBasedDocument):
labeled_spans: AnnotationList[LabeledSpan] = annotation_field(target="tokens")
diff --git a/tests/dataset_builders/pie/test_cdcp.py b/tests/dataset_builders/pie/test_cdcp.py
index 89904bb7..758d3d7b 100644
--- a/tests/dataset_builders/pie/test_cdcp.py
+++ b/tests/dataset_builders/pie/test_cdcp.py
@@ -29,7 +29,7 @@
DATASET_NAME = "cdcp"
SPLIT_SIZES = {"train": 581, "test": 150}
HF_DATASET_PATH = CDCP.BASE_DATASET_PATH
-PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME # "pie/cdcp"
+PIE_DATASET_PATH = PIE_BASE_PATH / DATASET_NAME
DATA_PATH = FIXTURES_ROOT / "dataset_builders" / "cdcp_acl17.zip"
HF_EXAMPLE_00195 = {
@@ -103,7 +103,7 @@ def generate_document_kwargs(hf_dataset, split):
@pytest.fixture(scope="module")
def generated_document(hf_example, generate_document_kwargs):
- return example_to_document(hf_example, **generate_document_kwargs)
+ return CDCP()._generate_document(hf_example, **generate_document_kwargs)
def test_generated_document(generated_document, split):