Skip to content

Commit

Permalink
kickstart
Browse files Browse the repository at this point in the history
  • Loading branch information
idalr committed Nov 3, 2023
1 parent 679306c commit d894d44
Show file tree
Hide file tree
Showing 10 changed files with 3,024 additions and 2,160 deletions.
168 changes: 168 additions & 0 deletions dataset_builders/hf/cdcp/cdcp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
"""The Cornell eRulemaking Corpus (CDCP) dataset for English Argumentation Mining."""
import glob
import json
from os.path import abspath, isdir
from pathlib import Path

import datasets

_CITATION = """\
@inproceedings{niculae-etal-2017-argument,
title = "Argument Mining with Structured {SVM}s and {RNN}s",
author = "Niculae, Vlad and
Park, Joonsuk and
Cardie, Claire",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P17-1091",
doi = "10.18653/v1/P17-1091",
pages = "985--995",
abstract = "We propose a novel factor graph model for argument mining, designed for settings in which the argumentative relations in a document do not necessarily form a tree structure. (This is the case in over 20{\\%} of the web comments dataset we release.) Our model jointly learns elementary unit type classification and argumentative relation prediction. Moreover, our model supports SVM and RNN parametrizations, can enforce structure constraints (e.g., transitivity), and can express dependencies between adjacent relations and propositions. Our approaches outperform unstructured baselines in both web comments and argumentative essay datasets.",
}
"""

_DESCRIPTION = "The CDCP dataset for English Argumentation Mining"

# No homepage or license text is recorded for this dataset here.
_HOMEPAGE = ""

_LICENSE = ""


# The HuggingFace datasets library doesn't host the datasets but only points to the original files.
# This can be an arbitrarily nested dict/list of URLs (see below in the `_split_generators` method);
# here it is a single archive URL.
_URL = "https://facultystaff.richmond.edu/~jpark/data/cdcp_acl17.zip"

# NOTE(review): `_VERSION` is not referenced anywhere else in the visible code - confirm whether
# it should be passed to the builder config.
_VERSION = datasets.Version("1.0.0")

# Label names handed to the `ClassLabel` features in `_info`:
# proposition (span) types first, relation types second.
_SPAN_CLASS_LABELS = ["fact", "policy", "reference", "testimony", "value"]
_RELATION_CLASS_LABELS = ["evidence", "reason"]


class CDCP(datasets.GeneratorBasedBuilder):
    """Dataset builder for CDCP, an argumentation mining dataset."""

    BUILDER_CONFIGS = [datasets.BuilderConfig(name="default")]

    DEFAULT_CONFIG_NAME = "default"  # type: ignore

    def _info(self):
        """Return the dataset metadata together with the feature schema."""
        proposition_schema = {
            "start": datasets.Value("int32"),
            "end": datasets.Value("int32"),
            "label": datasets.ClassLabel(names=_SPAN_CLASS_LABELS),
            # URLs are replaced with the string "__URL__" in the text; this
            # field carries the original URL (empty string if there is none).
            "url": datasets.Value("string"),
        }
        relation_schema = {
            # head and tail are indices into the propositions of the same example
            "head": datasets.Value("int32"),
            "tail": datasets.Value("int32"),
            "label": datasets.ClassLabel(names=_RELATION_CLASS_LABELS),
        }
        features = datasets.Features(
            {
                "id": datasets.Value("string"),
                "text": datasets.Value("string"),
                "propositions": datasets.Sequence(proposition_schema),
                "relations": datasets.Sequence(relation_schema),
            }
        )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            # there is no canonical (input, target) pair for this dataset
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return the train and test SplitGenerators.

        When ``dl_manager.manual_dir`` is set it is used as the data source
        (and extracted first if it is not a directory); otherwise the archive
        at ``_URL`` is downloaded and extracted. In both cases the data is
        read from the ``cdcp/train`` and ``cdcp/test`` sub-directories.
        """
        manual_dir = dl_manager.manual_dir
        if manual_dir is None:
            data_root = dl_manager.download_and_extract(_URL)
        else:
            data_root = abspath(manual_dir)
            if not isdir(data_root):
                # a manually provided archive still has to be unpacked
                data_root = dl_manager.extract(data_root)
        data_root = Path(data_root) / "cdcp"

        split_dirs = ((datasets.Split.TRAIN, "train"), (datasets.Split.TEST, "test"))
        return [
            datasets.SplitGenerator(name=split, gen_kwargs={"path": data_root / dir_name})
            for split, dir_name in split_dirs
        ]

    def _generate_examples(self, path):
        """Yield ``(key, example)`` pairs for all documents in ``path``.

        Every ``<name>.txt`` file is paired with ``<name>.ann.json``. The key
        is the position of the text file in the sorted file listing.

        Shape of an annotation file (illustrative values):
            {
                "prop_offsets": [[0, 114], [114, 209], [209, 235]],
                "prop_labels": ["testimony", "testimony", "value"],
                "reasons": [[[0, 1], 2]],
                "evidences": [],
                "url": {"1": "http://usa.visa.com/personal/using_visa/checkout_fees/"}
            }
        A relation entry ``[[first, last], head]`` links every proposition in
        the inclusive index range ``first..last`` (as tail) to ``head``.
        """
        # annotation field -> relation label, in the order relations are emitted
        relation_fields = (("evidences", "evidence"), ("reasons", "reason"))

        for key, text_file_name in enumerate(sorted(glob.glob(f"{path}/*.txt"))):
            txt_fn = Path(text_file_name)
            with open(txt_fn, encoding="utf-8") as text_file:
                text = text_file.read()
            with open(txt_fn.with_suffix(".ann.json"), encoding="utf-8") as ann_file:
                annotations = json.load(ann_file)

            propositions = []
            offsets_and_labels = zip(annotations["prop_offsets"], annotations["prop_labels"])
            for idx, ((start, end), span_label) in enumerate(offsets_and_labels):
                propositions.append(
                    {
                        "start": start,
                        "end": end,
                        "label": span_label,
                        # urls are keyed by the stringified proposition index
                        "url": annotations["url"].get(str(idx), ""),
                    }
                )

            relations = []
            for field, relation_label in relation_fields:
                for (tail_first, tail_last), head in annotations[field]:
                    relations.extend(
                        {"head": head, "tail": tail, "label": relation_label}
                        for tail in range(tail_first, tail_last + 1)
                    )

            yield key, {
                "id": txt_fn.stem,
                "text": text,
                "propositions": propositions,
                "relations": relations,
            }
158 changes: 158 additions & 0 deletions dataset_builders/pie/cdcp/cdcp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import dataclasses
import logging
from typing import Any, Callable, Dict, List, Optional

import datasets
import pytorch_ie.data.builder
from pie_utils.document.processors.text_span_trimmer import trim_text_spans
from pytorch_ie.annotations import BinaryRelation, LabeledSpan
from pytorch_ie.core import Annotation, AnnotationList, Document, annotation_field
from pytorch_ie.documents import TextDocumentWithLabeledSpansAndBinaryRelations


log = logging.getLogger(__name__)


def dl2ld(dict_of_lists):
    """Convert a dict of equally indexed lists into a list of dicts.

    ``{"a": [1, 2], "b": [3, 4]}`` becomes ``[{"a": 1, "b": 3}, {"a": 2, "b": 4}]``.
    The result is as long as the shortest list.
    """
    keys = list(dict_of_lists)
    columns = list(dict_of_lists.values())
    rows = []
    for row in zip(*columns):
        rows.append(dict(zip(keys, row)))
    return rows


def ld2dl(list_of_dicts, keys: Optional[List[str]] = None, as_list: bool = False):
    """Convert a list of dicts into a dict of lists (or a list of lists).

    Args:
        list_of_dicts: iterable of dicts; may be a list, a dict view or a
            generator.
        keys: the keys to collect, in output order. Defaults to the keys of
            the first dict, or to no keys at all when the input is empty.
        as_list: if True, return one list per key (ordered like ``keys``)
            instead of a dict mapping each key to its list.

    Returns:
        ``{key: [d[key] for each dict]}`` or, with ``as_list=True``, the
        corresponding list of value lists.

    Raises:
        KeyError: if a dict lacks one of the requested keys.
    """
    # Materialise once: the rows are traversed once per key, and the input may
    # be a one-shot generator or a non-subscriptable view such as dict.values().
    rows = list(list_of_dicts)
    if keys is None:
        keys = list(rows[0]) if rows else []
    if as_list:
        return [[row[key] for row in rows] for key in keys]
    return {key: [row[key] for row in rows] for key in keys}


@dataclasses.dataclass(frozen=True)
class Attribute(Annotation):
    """Annotation that attaches a string value to another annotation.

    In this module it is used to keep the original URL of a proposition.
    """

    # the attached value (in this module: the URL string)
    value: str
    # the annotation the value belongs to (in this module: a proposition span)
    annotation: Annotation


@dataclasses.dataclass
class CDCPDocument(Document):
    """Document type for CDCP: a text with propositions, relations and URLs."""

    # the raw document text
    text: str
    # document identifier
    id: Optional[str] = None
    # free-form metadata; a fresh dict is created for every document
    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
    # labeled spans over `text`
    propositions: AnnotationList[LabeledSpan] = annotation_field(target="text")
    # relations whose head and tail are entries of `propositions`
    relations: AnnotationList[BinaryRelation] = annotation_field(target="propositions")
    # URL attributes attached to entries of `propositions`
    urls: AnnotationList[Attribute] = annotation_field(target="propositions")


def example_to_document(
    example: Dict[str, Any],
    relation_int2str: Callable[[int], str],
    proposition_int2str: Callable[[int], str],
):
    """Build a ``CDCPDocument`` from a dataset example.

    Args:
        example: dict with the keys "id", "text", "propositions" and
            "relations"; the latter two are dicts of lists.
        relation_int2str: maps a relation label id to its name.
        proposition_int2str: maps a proposition label id to its name.

    Returns:
        The document with its propositions, URL attributes (only for
        propositions with a non-empty url) and relations.
    """
    doc = CDCPDocument(id=example["id"], text=example["text"])

    for entry in dl2ld(example["propositions"]):
        span = LabeledSpan(
            start=entry["start"],
            end=entry["end"],
            label=proposition_int2str(entry["label"]),
        )
        doc.propositions.append(span)
        original_url = entry.get("url", "")
        if original_url != "":
            doc.urls.append(Attribute(annotation=span, value=original_url))

    # relation heads and tails are indices into the propositions added above
    spans = doc.propositions
    for entry in dl2ld(example["relations"]):
        doc.relations.append(
            BinaryRelation(
                head=spans[entry["head"]],
                tail=spans[entry["tail"]],
                label=relation_int2str(entry["label"]),
            )
        )

    return doc


def document_to_example(
    document: CDCPDocument,
    relation_str2int: Callable[[str], int],
    proposition_str2int: Callable[[str], int],
) -> Dict[str, Any]:
    """Convert a ``CDCPDocument`` back into a dataset example.

    Args:
        document: the document to serialise.
        relation_str2int: maps a relation label name to its id.
        proposition_str2int: maps a proposition label name to its id.

    Returns:
        A dict with "id", "text", "propositions" and "relations", where the
        latter two are dicts of lists.
    """
    index_of = {}
    row_of = {}
    for position, span in enumerate(document.propositions):
        index_of[span] = position
        row_of[span] = {
            "start": span.start,
            "end": span.end,
            "label": proposition_str2int(span.label),
            "url": "",
        }

    # fill in the url for every proposition that has one attached
    for attribute in document.urls:
        row_of[attribute.annotation]["url"] = attribute.value

    relation_rows = []
    for relation in document.relations:
        relation_rows.append(
            {
                "head": index_of[relation.head],
                "tail": index_of[relation.tail],
                "label": relation_str2int(relation.label),
            }
        )

    return {
        "id": document.id,
        "text": document.text,
        "propositions": ld2dl(row_of.values(), keys=["start", "end", "label", "url"]),
        "relations": ld2dl(relation_rows, keys=["head", "tail", "label"]),
    }


def convert_to_text_document_with_labeled_spans_and_binary_relations(
    document: CDCPDocument,
    verbose: bool = True,
) -> TextDocumentWithLabeledSpansAndBinaryRelations:
    """Convert a ``CDCPDocument`` to the generic span/relation document type.

    The "propositions" layer is mapped to "labeled_spans" and "relations" to
    "binary_relations"; the result is then passed through ``trim_text_spans``
    on the "labeled_spans" layer.

    Args:
        document: the document to convert.
        verbose: forwarded to ``trim_text_spans``.
    """
    layer_names = {"propositions": "labeled_spans", "relations": "binary_relations"}
    converted = document.as_type(
        TextDocumentWithLabeledSpansAndBinaryRelations,
        field_mapping=layer_names,
    )
    return trim_text_spans(converted, layer="labeled_spans", verbose=verbose)


class CDCPConfig(datasets.BuilderConfig):
    """BuilderConfig for CDCP.

    Adds no options of its own; all arguments go to ``datasets.BuilderConfig``.
    """

    # NOTE(review): the builder in this file lists a plain
    # `datasets.BuilderConfig` in BUILDER_CONFIGS, so this class appears to be
    # unused here - confirm before relying on it.
    def __init__(self, **kwargs):
        """BuilderConfig for CDCP.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


class CDCP(pytorch_ie.data.builder.GeneratorBasedBuilder):
    """Builder that wraps the HF CDCP dataset and produces ``CDCPDocument`` objects."""

    DOCUMENT_TYPE = CDCPDocument

    DOCUMENT_CONVERTERS = {
        TextDocumentWithLabeledSpansAndBinaryRelations: convert_to_text_document_with_labeled_spans_and_binary_relations
    }

    BASE_DATASET_PATH = "DFKI-SLT/cdcp"

    BUILDER_CONFIGS = [datasets.BuilderConfig(name="default")]

    DEFAULT_CONFIG_NAME = "default"  # type: ignore

    def _generate_document_kwargs(self, dataset):
        """Return the label decoders that ``_generate_document`` needs.

        Both are the ``int2str`` methods of the "label" features of the
        "relations" and "propositions" columns of ``dataset``.
        """
        features = dataset.features
        relation_labels = features["relations"].feature["label"]
        proposition_labels = features["propositions"].feature["label"]
        return {
            "relation_int2str": relation_labels.int2str,
            "proposition_int2str": proposition_labels.int2str,
        }

    def _generate_document(self, example, relation_int2str, proposition_int2str):
        """Convert one dataset example into a ``CDCPDocument``."""
        decoders = {
            "relation_int2str": relation_int2str,
            "proposition_int2str": proposition_int2str,
        }
        return example_to_document(example, **decoders)
Loading

0 comments on commit d894d44

Please sign in to comment.